author     Georgios Pinitas <georgios.pinitas@arm.com>  2021-07-16 16:16:43 +0100
committer  Georgios Pinitas <georgios.pinitas@arm.com>  2021-07-22 02:25:50 +0000
commit     4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 (patch)
tree       2f8362d33cdad4212f4b96995681c68184c759e1
parent     59fd7a722e5bc7e85309d6200bc37a772721a719 (diff)
download   ComputeLibrary-4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2.tar.gz
Update GEMM assembly kernels
- Introduce Fp32 kernels with internal calculations in Bfloat16 when fast_mode is enabled
- Improve kernel selection heuristics

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
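Background for the first bullet, as a minimal illustration (not part of the patch): BF16 keeps FP32's sign bit and 8-bit exponent but only 7 mantissa bits, so the fast-mode kernels trade a little input precision for the higher per-instruction throughput of the BFDOT/BFMMLA instructions, while still accumulating in FP32. Truncation is shown for simplicity; the hardware conversion rounds.

    #include <cstdint>
    #include <cstring>

    // fp32 -> bf16 narrowing: keep sign, exponent and the top 7 mantissa bits.
    // This is the precision that "internal calculations in Bfloat16" gives up;
    // the new kernels' accumulators remain fp32.
    static inline uint16_t fp32_to_bf16_truncate(float f)
    {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        return static_cast<uint16_t>(bits >> 16);
    }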
-rw-r--r--  Android.bp  45
-rw-r--r--  SConstruct  3
-rw-r--r--  arm_compute/core/CPP/CPPTypes.h  8
-rw-r--r--  arm_compute/core/Types.h  14
-rw-r--r--  filelist.json  51
-rw-r--r--  src/common/cpuinfo/CpuModel.cpp  33
-rw-r--r--  src/core/NEON/kernels/arm_gemm/asmlib.hpp  5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp  71
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp  39
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp  118
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp  23
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp  147
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp  46
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int8.cpp  72
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp  94
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp  116
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp  94
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp  68
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_batched.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp  101
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp  169
-rw-r--r--  src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp  1
-rw-r--r--  src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp  25
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp  26
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp  27
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp  36
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp  1547
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp  3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp  25
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp)  75
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp  3725
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp  30
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp  55
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp  5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp  117
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp  2807
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp  2595
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp  38
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp  52
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp  103
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp  2426
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp  103
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp  3137
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp  30
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp  18
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp  174
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp  102
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp  2104
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp  30
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp  384
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp  102
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp  3640
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp  41
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp  113
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp  3463
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp  30
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp  18
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp  174
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp  102
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp  2104
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp  41
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp  113
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp  3463
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp  65
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp  508
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp  330
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp  77
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp  647
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp  110
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp  263
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp  247
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp  247
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp  115
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp  360
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp  320
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp  320
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp  110
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp  273
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp  253
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp  253
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp  685
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp  110
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp  273
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp  253
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp  253
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp  644
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp  33
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp  1
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp  1
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp  82
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp  1372
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp  26
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp  149
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp  104
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp  2045
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp  36
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp  1366
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp  147
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp  32
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp  1366
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp  147
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp  18
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp  1143
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp  258
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp  104
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp  1306
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp  104
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp  1793
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp  24
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp  146
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp  101
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp  1418
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp  24
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp  343
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp  101
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp  2431
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp  38
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp  1033
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp  147
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp  115
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp  1675
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp  24
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp  144
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp  101
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp  1418
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp  40
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp  1033
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp  149
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp  115
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp  1675
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp  65
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp  517
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp  77
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp  632
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp  74
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp  269
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp  502
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp  70
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp  269
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp  515
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp  270
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp  515
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp  622
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp  270
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp  515
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp  75
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp  622
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp  232
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp  234
-rw-r--r--  src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp  41
-rw-r--r--  src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/mergeresults.cpp  3
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp  24
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/list.hpp  1
-rw-r--r--  src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp  15
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transform-sve.cpp  35
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transform.cpp  136
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transform.hpp  90
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp  14
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp  289
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp  432
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp  335
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp  344
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp  445
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp  735
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp  275
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp  275
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp  145
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp  120
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp  137
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp  332
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp  291
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp  246
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp  511
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp  447
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp  272
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp  787
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp  295
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp  295
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp  130
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp  508
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp  452
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp  245
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp  319
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp  338
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp  255
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp  147
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp  269
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp  42
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/list.hpp  29
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp  376
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp  163
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp  310
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp  174
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp  368
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp  318
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp  188
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp  322
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp  348
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp  295
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp  411
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp  238
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp  322
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp  307
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp  286
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp  259
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp  380
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp  465
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp  282
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/utils.hpp  25
-rw-r--r--  src/core/cpu/kernels/assembly/arm_gemm.hpp  10
-rw-r--r--  src/core/cpu/kernels/assembly/gemm_common.hpp  7
-rw-r--r--  src/runtime/CL/functions/CLFullyConnectedLayer.cpp  2
-rw-r--r--  src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp  2
-rw-r--r--  src/runtime/NEON/functions/NEConvolutionLayer.cpp  2
-rw-r--r--  src/runtime/cpu/operators/CpuGemm.cpp  1
-rw-r--r--  src/runtime/cpu/operators/CpuGemmConvolution.cpp  9
-rw-r--r--  src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp  1
-rw-r--r--  src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp  1
-rw-r--r--  src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp  10
-rw-r--r--  src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h  1
-rw-r--r--  tests/validation/fixtures/GEMMFixture.h  2
240 files changed, 77538 insertions, 10441 deletions
diff --git a/Android.bp b/Android.bp
index 1d9ec1c9c1..e5bb7a6a80 100644
--- a/Android.bp
+++ b/Android.bp
@@ -198,12 +198,15 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
"src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
+ "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
"src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp",
"src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
"src/core/NEON/kernels/arm_gemm/misc.cpp",
"src/core/NEON/kernels/arm_gemm/quantized.cpp",
"src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
"src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
+ "src/core/NEON/kernels/arm_gemm/transform-sve.cpp",
+ "src/core/NEON/kernels/arm_gemm/transform.cpp",
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp",
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp",
"src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp",
@@ -838,30 +841,52 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
@@ -880,23 +905,39 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp",
diff --git a/SConstruct b/SConstruct
index 8fa78b4c6f..8dbb68952c 100644
--- a/SConstruct
+++ b/SConstruct
@@ -309,7 +309,8 @@ if env['fat_binary']:
if env['arch'] != 'armv8.2-a':
print("Currently fat binary is only supported with armv8.2-a")
Exit(1)
- env.Append(CXXFLAGS = ['-DENABLE_SVE', '-DARM_COMPUTE_ENABLE_SVE'])
+ env.Append(CXXFLAGS = ['-DENABLE_SVE', '-DARM_COMPUTE_ENABLE_SVE',
+ '-DARM_COMPUTE_ENABLE_BF16', '-DARM_COMPUTE_ENABLE_I8MM', '-DARM_COMPUTE_ENABLE_SVEF32MM'])
env.Append(CXXFLAGS = ['-DENABLE_NEON', '-DARM_COMPUTE_ENABLE_NEON'])
if env['data_type_support']:
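For reference, a build line exercising the fat-binary branch above might look as follows; `arch` and `fat_binary` appear in this excerpt, while the remaining switches are the library's usual SCons options and are assumptions here:

    scons os=linux arch=armv8.2-a neon=1 fat_binary=1 -j8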
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index 4484271d63..76378d27ef 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -34,13 +34,15 @@ namespace arm_compute
X(GENERIC) \
X(GENERIC_FP16) \
X(GENERIC_FP16_DOT) \
- X(A35) \
X(A53) \
X(A55r0) \
X(A55r1) \
+ X(A35) \
X(A73) \
- X(KLEIN) \
- X(X1)
+ X(A510) \
+ X(X1) \
+ X(V1) \
+ X(A64FX)
/** CPU models types
*
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index f6658e7544..9c00cbc88c 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1948,6 +1948,7 @@ public:
_reinterpret_input_as_3d(false),
_retain_internal_weights(false),
_gemmlowp_output_stage(),
+ _fast_math(false),
_fp_mixed_precision(false),
_broadcast_bias(false),
_pretranpose_B(true),
@@ -1967,12 +1968,13 @@ public:
* @param[in] retain_internal_weights (Optional) Retain the weights tensor from previous run
* @param[in] gemmlowp_output_stage (Optional) GEMMLowp Output stage info
* @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+ * @param[in] fast_math (Optional) Use a data type of shorter width to improve performance
* @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
* @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
* @param[in] constant_weights (Optional) Weights have constant values throughout multiple executions
*/
GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
- GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool broadcast_bias = false,
+ GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool fast_math = false, bool broadcast_bias = false,
const ActivationLayerInfo &activation_info = ActivationLayerInfo(), bool constant_weights = true) noexcept
: _is_a_reshaped(is_a_reshaped),
_is_b_reshaped(is_b_reshaped),
@@ -1981,6 +1983,7 @@ public:
_reinterpret_input_as_3d(reinterpret_input_as_3d),
_retain_internal_weights(retain_internal_weights),
_gemmlowp_output_stage(gemmlowp_output_stage),
+ _fast_math(fast_math),
_fp_mixed_precision(fp_mixed_precision),
_broadcast_bias(broadcast_bias),
_pretranpose_B(reshape_b_only_on_first_run),
@@ -2062,6 +2065,14 @@ public:
{
return _fp_mixed_precision;
};
+ /** Flag which specifies if a shorter accumulator is to be used.
+ *
+ * @return True if a shorter accumulator has to be used
+ */
+ bool fast_math() const
+ {
+ return _fast_math;
+ };
/** Flag which specifies whether to broadcast the shape of the bias tensor.
*
* @return True if the shape of the bias tensor is to be broadcasted.
@@ -2119,6 +2130,7 @@ private:
bool _reinterpret_input_as_3d;
bool _retain_internal_weights;
GEMMLowpOutputStageInfo _gemmlowp_output_stage;
+ bool _fast_math;
bool _fp_mixed_precision;
bool _broadcast_bias;
bool _pretranpose_B;
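To make the new flag concrete, a minimal caller-side sketch using the constructor documented above (all arguments before `fast_math` take their documented defaults; only the ninth argument is new):

    #include "arm_compute/core/Types.h"

    void configure_gemm_example()
    {
        // Request the fast-math path for an FP32 GEMM: fast_math = true permits
        // the backend to use a shorter internal data type (BF16) where supported.
        arm_compute::GEMMInfo gemm_info(false,                                  // is_a_reshaped
                                        false,                                  // is_b_reshaped
                                        false,                                  // reshape_b_only_on_first_run
                                        0,                                      // depth_output_gemm3d
                                        false,                                  // reinterpret_input_as_3d
                                        false,                                  // retain_internal_weights
                                        arm_compute::GEMMLowpOutputStageInfo(), // gemmlowp_output_stage
                                        false,                                  // fp_mixed_precision
                                        true);                                  // fast_math
        // gemm_info.fast_math() now reports true to the assembly dispatch.
    }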
diff --git a/filelist.json b/filelist.json
index 68e6aebf4f..e256744aab 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1210,12 +1210,14 @@
"src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
+ "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
"src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
"src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
"src/core/NEON/kernels/arm_gemm/misc.cpp",
"src/core/NEON/kernels/arm_gemm/quantized.cpp",
"src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
- "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp"
+ "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
+ "src/core/NEON/kernels/arm_gemm/transform.cpp"
],
"neon": {
"estate32": [
@@ -1234,30 +1236,52 @@
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
@@ -1280,29 +1304,46 @@
},
"sve": {
"all": [
- "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp"
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp",
+ "src/core/NEON/kernels/arm_gemm/transform-sve.cpp"
]
}
}
diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp
index 9f4d5d1433..2328f62515 100644
--- a/src/common/cpuinfo/CpuModel.cpp
+++ b/src/common/cpuinfo/CpuModel.cpp
@@ -50,8 +50,10 @@ bool model_supports_fp16(CpuModel model)
case CpuModel::GENERIC_FP16:
case CpuModel::GENERIC_FP16_DOT:
case CpuModel::A55r1:
+ case CpuModel::A510:
case CpuModel::X1:
- case CpuModel::KLEIN:
+ case CpuModel::V1:
+ case CpuModel::A64FX:
return true;
default:
return false;
@@ -64,8 +66,10 @@ bool model_supports_dot(CpuModel model)
{
case CpuModel::GENERIC_FP16_DOT:
case CpuModel::A55r1:
+ case CpuModel::A510:
case CpuModel::X1:
- case CpuModel::KLEIN:
+ case CpuModel::V1:
+ case CpuModel::A64FX:
return true;
default:
return false;
@@ -76,7 +80,9 @@ bool model_supports_sve(CpuModel model)
{
switch(model)
{
- case CpuModel::KLEIN:
+ case CpuModel::A510:
+ case CpuModel::V1:
+ case CpuModel::A64FX:
return true;
default:
return false;
@@ -92,9 +98,9 @@ CpuModel midr_to_model(uint32_t midr)
const int variant = (midr >> 20) & 0xF;
const int cpunum = (midr >> 4) & 0xFFF;
+ // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
if(implementer == 0x41) // Arm CPUs
{
- // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
switch(cpunum)
{
case 0xd03: // A53
@@ -134,11 +140,26 @@ CpuModel midr_to_model(uint32_t midr)
case 0xd4a: // E1
model = CpuModel::GENERIC_FP16_DOT;
break;
+ case 0xd40: // V1
+ model = CpuModel::V1;
+ break;
case 0xd44: // X1
model = CpuModel::X1;
break;
case 0xd46:
- model = CpuModel::KLEIN;
+ model = CpuModel::A510;
+ break;
+ default:
+ model = CpuModel::GENERIC;
+ break;
+ }
+ }
+ else if(implementer == 0x46)
+ {
+ switch(cpunum)
+ {
+ case 0x001: // A64FX
+ model = CpuModel::A64FX;
break;
default:
model = CpuModel::GENERIC;
@@ -147,7 +168,6 @@ CpuModel midr_to_model(uint32_t midr)
}
else if(implementer == 0x48)
{
- // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
switch(cpunum)
{
case 0xd40: // A76
@@ -160,7 +180,6 @@ CpuModel midr_to_model(uint32_t midr)
}
else if(implementer == 0x51)
{
- // Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
switch(cpunum)
{
case 0x800: // A73
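The detection above is keyed on MIDR_EL1 bit-fields; a short sketch of the decoding follows. The variant and part-number shifts appear in the hunk above, while the implementer shift is the standard architectural layout and an assumption here. Note that part numbers are scoped to the implementer: 0xd40 means Neoverse V1 under implementer 0x41 (Arm) but A76 under 0x48.

    #include <cstdint>

    struct MidrFields
    {
        uint32_t implementer; // bits [31:24]: 0x41 = Arm, 0x46 = Fujitsu (A64FX), 0x48, 0x51, ...
        uint32_t variant;     // bits [23:20]
        uint32_t partnum;     // bits [15:4]: e.g. 0xd44 = X1, 0xd46 = A510 under implementer 0x41
    };

    static MidrFields decode_midr(uint32_t midr)
    {
        return {(midr >> 24) & 0xFF, (midr >> 20) & 0xF, (midr >> 4) & 0xFFF};
    }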
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
index 7766656adb..4f2c47bf11 100644
--- a/src/core/NEON/kernels/arm_gemm/asmlib.hpp
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,9 +37,6 @@
#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
-// Lee's uarchsim hack
-//#define ASM_PREFETCH(address) "LDNP x20, x21, " address "\n"
-
// No preload at all
//#define ASM_PREFETCH(address) ""
#else
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index 8244523696..af80c3637c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -31,73 +31,92 @@
#include "gemv_batched.hpp"
#include "gemv_pretransposed.hpp"
+#include "kernels/a32_sgemm_8x6.hpp"
+
#include "kernels/a64_hybrid_bf16fp32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_bf16fp32_dot_8x12.hpp"
#include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
#include "kernels/a64_sgemm_8x12.hpp"
-#include "kernels/a32_sgemm_8x6.hpp"
+
+#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
-#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
namespace arm_gemm {
static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] =
{
+#ifdef __aarch64__
#ifdef ARM_COMPUTE_ENABLE_BF16
#ifdef ARM_COMPUTE_ENABLE_SVE
-{ // gemm_bf16_interleaved
+// gemm_bf16_interleaved
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_bf16fp32_mmla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svebf16() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
-},
-{
+),
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_bf16fp32_mmla_6x4VL",
+ [](const GemmArgs &args) { return args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_mmla_6x4VL, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_bf16fp32_dot_6x4VL",
[](const GemmArgs &args) { return args._ci->has_svebf16(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>(args); }
-},
-{ // gemm_bf16_interleaved
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_bf16fp32_dot_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svebf16() && (args._Ksize>2); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); }
-},
-# endif // SVE
-{ // gemm_bf16_interleaved
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<bfloat16, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_bf16fp32_mmla_6x16",
+ [](const GemmArgs &args) { return args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_mmla_6x16, bfloat16, float>(args); }
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_bf16fp32_mmla_8x12",
[](const GemmArgs &args) { return args._ci->has_bf16() && (args._Ksize>4); },
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
-},
-{
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_bf16fp32_dot_6x16",
[](const GemmArgs &args) { return args._ci->has_bf16(); },
- nullptr,
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>(args); }
-},
-{ // gemm_bf16_interleaved
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_bf16fp32_dot_8x12",
[](const GemmArgs &args) { return args._ci->has_bf16() && (args._Ksize>2); },
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
-},
-#endif // ARM_COMPUTE_ENABLE_BF16
-#ifdef __aarch64__
-{
+),
+GemmImplementation<bfloat16, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_sgemm_8x12",
nullptr,
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>::estimate_cycles<bfloat16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>(args); }
-},
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
#elif defined(__arm__)
{
GemmMethod::GEMM_INTERLEAVED,
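The pattern change in this file, in miniature: entries move from static suitability predicates (previously hard-coded CPU-model exclusions such as KLEIN) to `with_estimate`, which supplies a per-kernel cycle estimate so the dispatcher can rank all eligible kernels by predicted cost. A simplified sketch of such a selection loop, assuming an entry type with `is_supported` and `cycle_estimate` callables (the real `GemmImplementation` machinery also honours method filters and defaults not shown here):

    #include <cstddef>
    #include <cstdint>

    template <typename Entry, typename Args>
    const Entry *pick_cheapest(const Entry *table, size_t n, const Args &args)
    {
        const Entry *best      = nullptr;
        uint64_t     best_cost = UINT64_MAX;
        for(size_t i = 0; i < n; i++)
        {
            const Entry &e = table[i];
            // First lambda column: hard eligibility (ISA features, problem shape).
            if(e.is_supported && !e.is_supported(args))
            {
                continue;
            }
            // Second lambda column: estimated cycles; lowest eligible estimate wins.
            const uint64_t cost = e.cycle_estimate ? e.cycle_estimate(args) : 0;
            if(cost < best_cost)
            {
                best      = &e;
                best_cost = cost;
            }
        }
        return best;
    }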
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index b41d8dd097..01976132ed 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -23,7 +23,7 @@
*/
// This can only be built if the target/compiler supports FP16 arguments.
-#ifdef __ARM_FP16_ARGS
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
#include "arm_gemm.hpp"
@@ -43,48 +43,37 @@
namespace arm_gemm {
static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
-#if defined(ARM_COMPUTE_ENABLE_SVE)
-{
+#ifdef ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp16_mla_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
- [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
-},
-{
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>(args); }
+),
+GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_fp16_mla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize > 4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); }
-},
-#endif
-
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+#if defined(__aarch64__)
GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp16_mla_6x32",
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
[](const GemmArgs &args) { return args._ci->has_fp16(); },
-#else
- nullptr,
-#endif
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>::estimate_cycles(args, cls_a64_hybrid_fp16_mla_6x32::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
),
GemmImplementation<__fp16, __fp16>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_hgemm_8x24",
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
[](const GemmArgs &args) { return args._ci->has_fp16(); },
-#else
- nullptr,
-#endif
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>::estimate_cycles(args, cls_a64_hgemm_8x24::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>::estimate_cycles<__fp16>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>(args); }
),
-#endif // aarch64 && FP16
-#ifdef __aarch64__
{
GemmMethod::GEMM_INTERLEAVED,
"a64_sgemm_8x12",
@@ -124,4 +113,4 @@ template std::vector<KernelDescription> get_compatible_kernels<__fp16, __fp16, N
} // namespace arm_gemm
-#endif // __ARM_FP16_ARGS
+#endif // defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 1632e301ac..3cf84a614a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -31,17 +31,22 @@
#include "gemv_pretransposed.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_gemv_fp32_mla_32.hpp"
+#include "kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp"
+#include "kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp"
+#include "kernels/a64_hybrid_fp32_mla_4x24.hpp"
#include "kernels/a64_hybrid_fp32_mla_6x16.hpp"
#include "kernels/a64_hybrid_fp32_mla_8x4.hpp"
+#include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
#include "kernels/a64_sgemm_8x12.hpp"
#include "kernels/a64_sgemm_8x6.hpp"
#include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp"
#include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp"
-#include "kernels/sve_gemv_fp32_mla_8VL.hpp"
+#include "kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp"
+#include "kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp"
#include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp"
#include "kernels/sve_hybrid_fp32_mla_8x1VL.hpp"
+#include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
#include "kernels/sve_interleaved_fp32_mla_8x3VL.hpp"
#include "kernels/sve_interleaved_fp32_mmla_8x3VL.hpp"
#include "kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp"
@@ -59,57 +64,94 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
[](const GemmArgs &args) { return new GemvBatched<float, float>(args); }
},
#ifdef __aarch64__
+#ifdef ARM_COMPUTE_ENABLE_BF16
+// "fast mode" (BF16) kernels
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_interleaved_bf16fp32_mmla_8x12",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_fp32bf16fp32_mmla_6x16",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_6x16, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_fp32bf16fp32_mmla_4x24",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32bf16fp32_mmla_4x24, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
#ifdef ARM_COMPUTE_ENABLE_SVE
-{
+#ifdef ARM_COMPUTE_ENABLE_BF16
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_bf16fp32_mmla_8x3VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_svebf16(); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
- "sve_gemv_fp32_mla_8VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemvPretransposed<cls_sve_gemv_fp32_mla_8VL, float, float>(args); }
-},
-#endif
-{
+ "sve_hybrid_fp32bf16fp32_mmla_6x4VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
- "a64_gemv_fp32_mla_32",
- [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
- nullptr,
- [](const GemmArgs &args) { return new GemvPretransposed<cls_a64_gemv_fp32_mla_32, float, float>(args); }
-},
-
-// MMLA next due to higher throughput (SVE only)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVEF32MM)
+ "sve_hybrid_fp32bf16fp32_mmla_4x6VL",
+ [](const GemmArgs &args) { return args._fast_mode && args._ci->has_bf16(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL, float, float>(args); }
+),
+#endif // ARM_COMPUTE_ENABLE_BF16
+#ifdef ARM_COMPUTE_ENABLE_SVEF32MM
+// MMLA next due to higher throughput (an SVE-only feature).
+// Prefer this in all cases, except when fast mode is requested and BF16 is available.
{
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_fp32_mmla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svef32mm() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return !(args._fast_mode && args._ci->has_bf16()); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float>(args); }
},
-#endif // ARM_COMPUTE_ENABLE_SVE && ARM_COMPUTE_ENABLE_SVEF32MM
-
-#ifdef ARM_COMPUTE_ENABLE_SVE
-// SVE smallk / hybrid methods
+#endif // ARM_COMPUTE_ENABLE_SVEF32MM
+// SVE kernels
{
GemmMethod::GEMM_HYBRID,
"sve_smallK_hybrid_fp32_mla_8x1VL",
[](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize <= 24 && !args._indirect_input; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ nullptr,
[](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_fp32_mla_8x1VL, float, float>(args); }
},
{
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp32_mla_8x1VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (args._Nsize < 12); },
+ [](const GemmArgs &args) { return (args._Nsize < 12); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_8x1VL, float, float>(args); }
},
-{
+GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_fp32_mla_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>(args); }
-},
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_fp32_mla_8x3VL",
+ [](const GemmArgs &args) { return args._ci->has_sve(); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); }
+),
#endif // ARM_COMPUTE_ENABLE_SVE
// Cortex-A35 specific kernel - use for any problem on A35, and never in any other cases.
{
@@ -143,25 +185,23 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
},
GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_fp32_mla_4x24",
+ nullptr,
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float>::estimate_cycles<float>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_4x24, float, float>(args); }
+),
+GemmImplementation<float, float>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
"a64_hybrid_fp32_mla_6x16",
nullptr,
- [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>::estimate_cycles(args, cls_a64_hybrid_fp32_mla_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>(args); }
),
-#ifdef ARM_COMPUTE_ENABLE_SVE
-{
- GemmMethod::GEMM_INTERLEAVED,
- "sve_interleaved_fp32_mla_8x3VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
- [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); }
-},
-#endif // ARM_COMPUTE_ENABLE_SVE
GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_sgemm_8x12",
nullptr,
- [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles(args, cls_a64_sgemm_8x12::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles<float>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, float, float>(args); }
),
#endif // __aarch64__
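
The recurring change in this table is that hand-tuned size cut-offs (Ksize <= 256 and the like) give way to estimate_cycles<T>(args), so candidates are ranked by a per-CPU cost model rather than accepted by the first matching predicate. A simplified sketch of that selection scheme under assumed types and throughput figures; Candidate, est_mmla and est_mla are illustrative names, not the library's own:

#include <cstdint>
#include <cstdio>

struct Args { unsigned M, N, K; };

// Simplified stand-in for a GemmImplementation entry; the real one also
// carries a method enum and an instantiation lambda.
struct Candidate {
    const char *name;
    bool     (*is_supported)(const Args &);
    uint64_t (*cycle_estimate)(const Args &);
};

// Hypothetical estimates: cycles ~ MACs / (MACs per cycle), in the spirit of
// the PerformanceParameters-based model this commit switches to.
static uint64_t est_mmla(const Args &a) { return uint64_t(a.M) * a.N * a.K / 32; }
static uint64_t est_mla (const Args &a) { return uint64_t(a.M) * a.N * a.K / 16; }

static const Candidate table[] = {
    { "bf16_mmla", [](const Args &) { return true; }, est_mmla },
    { "fp32_mla",  [](const Args &) { return true; }, est_mla  },
};

// Pick the supported candidate with the lowest estimated cycle count.
static const Candidate *select_gemm(const Args &a) {
    const Candidate *best = nullptr;
    for (const auto &c : table) {
        if (!c.is_supported(a)) continue;
        if (!best || c.cycle_estimate(a) < best->cycle_estimate(a)) best = &c;
    }
    return best;
}

int main() {
    Args a{128, 128, 256};
    std::printf("selected: %s\n", select_gemm(a)->name);
}
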
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index d702cffce1..436316c0f7 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -74,7 +74,7 @@ class GemmHybrid : public GemmCommon<To, Tr> {
}
if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
+ return roundup(args._cfg->inner_block_size, strategy::k_unroll());
}
// Target block size (512 for FP32, scaling for other types). Don't block until size reaches 1.5X this.
@@ -97,7 +97,13 @@ class GemmHybrid : public GemmCommon<To, Tr> {
// single block.
static unsigned int compute_n_block(const GemmArgs &args) {
if (args._cfg && args._cfg->outer_block_size) {
- return args._cfg->outer_block_size;
+ unsigned int n_block = args._cfg->outer_block_size;
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ n_block /= strategy::out_width();
+ n_block = std::max(n_block, 1u) * strategy::out_width();
+
+ return n_block;
}
if (args._Nsize <= 64) {
@@ -264,6 +270,17 @@ public:
return total_cycles;
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_HYBRID;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _n_block;
+ c.filter = get_type_name<strategy>();
+
+ return c;
+ }
};
} // namespace arm_gemm
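
Both compute_k_block() and compute_n_block() above stop trusting a caller-supplied GemmConfig verbatim and snap it to kernel granularity instead, which is also what the new get_config() reports back. A small sketch of the two rounding rules, assuming roundup() behaves as in arm_gemm's utilities (the out_width of 16 is just an example):

#include <algorithm>
#include <cassert>

// Same contract as arm_gemm's roundup(): round x up to a multiple of m.
static unsigned int roundup(unsigned int x, unsigned int m) {
    unsigned int rem = x % m;
    return rem ? x + (m - rem) : x;
}

// Clamp a requested outer block to whole kernel output widths, never zero,
// mirroring the compute_n_block() change above.
static unsigned int legal_n_block(unsigned int requested, unsigned int out_width) {
    unsigned int n_block = requested / out_width;
    return std::max(n_block, 1u) * out_width;
}

int main() {
    assert(roundup(500, 8) == 504);        // inner block rounded UP to k_unroll
    assert(legal_n_block(100, 16) == 96);  // outer block rounded DOWN...
    assert(legal_n_block(10, 16) == 16);   // ...but never below one width
    return 0;
}
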
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index 41fecc6bec..5cbdf20798 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,31 +55,31 @@ namespace {
template<typename OutputStage, bool SeparateQuantize = false>
class run_hybrid_kernel {
public:
- template<typename strategy, typename To, typename Tr>
- static void run (
+ template<typename strategy, typename Tlo, typename Tro, typename Tr>
+ static inline void run (
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Nothing, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Nothing, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
UNUSED(kern_k);
- /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
+ /* Indirect hybrid kernels read the full width of the bias. So we need to detect the case where we are writing
* a partial block and pad the bias for that block. */
if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
/* Break N into "N_bulk" (a multiple of output width) and "N_remainder" */
@@ -112,13 +112,13 @@ void run_hybrid_kernel<Nothing, false>::run(
}
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, false>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, false>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
@@ -129,13 +129,13 @@ void run_hybrid_kernel<Requantize32, false>::run(
}
template<>
-template<typename strategy, typename To, typename Tr>
-void run_hybrid_kernel<Requantize32, true>::run(
+template<typename strategy, typename Tlo, typename Tro, typename Tr>
+inline void run_hybrid_kernel<Requantize32, true>::run(
#ifdef CYCLE_PROFILING
profiler &prof,
#endif
- const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
- unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
UNUSED(kern_k);
// On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
@@ -183,7 +183,8 @@ void run_hybrid_kernel<Requantize32, true>::run(
// Implementation of the GemmCommon abstract class.
template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
+ typedef typename strategy::lhs_operand_type Tloi;
+ typedef typename strategy::rhs_operand_type Troi;
typedef typename strategy::result_type Tri;
GemmArgs _args;
@@ -201,7 +202,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
const unsigned int _Mround;
/* Pretransposed buffer. */
- const Toi *_B_transposed=nullptr;
+ const Troi *_B_transposed=nullptr;
/* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
const To * const * const * _indirect_buf = nullptr;
@@ -233,7 +234,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tr> {
}
if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
+ return roundup(args._cfg->inner_block_size, strategy::k_unroll());
}
// Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
@@ -356,8 +357,8 @@ public:
// In convolution mode, we need input pointers.
if (_convolver) {
- in_row_ptrs.resize(strategy::out_height() * _args._Ksections, nullptr);
- in_row_strings.resize(_args._Ksections, nullptr);
+ in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
+ in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);
for (unsigned int i=0; i<_args._Ksections; i++) {
in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
@@ -371,7 +372,7 @@ public:
/* Make sure we've been set up correctly. */
assert(_B_transposed);
- static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+ static_assert(std::is_same<To, Tloi>::value, "gemm_native: Operand types must be the same.");
// static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
/* For now, each work item implies all the K for a given output
@@ -422,7 +423,7 @@ public:
const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
const unsigned int multi = p.dim(3);
- const Toi *b_panel = _B_transposed +
+ const Troi *b_panel = _B_transposed +
(multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
(k0 * roundup(_args._Nsize, strategy::out_width())) +
(n0 * kern_k);
@@ -510,7 +511,7 @@ public:
size_t get_B_pretransposed_array_size() const override {
// Start with actual pretransposed buffer...
- size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);
+ size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);
// Space for result row pointers (not strictly needed any more but retained for indirect output testing)
size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);
@@ -536,7 +537,7 @@ public:
// Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ Troi *buffer = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
_B_transposed = buffer;
strategy strat(_args._ci);
@@ -548,47 +549,55 @@ public:
/* Figure out the size of each block. */
unsigned int k_size = kmax - k0;
- // We need to insert padding at the end of each K section.
- // The computation needed is a little delicate - the coordinates from the block walker are expressed in
- // terms of the full, padded, _Ktotal.
- // But we need to transform each section with reference to the original, unpadded, input, letting the
- // transform pad each section as needed.
+ if (_args._Ksections > 1) {
+ // We need to insert padding at the end of each K section.
+ // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+ // terms of the full, padded, _Ktotal.
+ // But we need to transform each section with reference to the original, unpadded, input, letting the
+ // transform pad each section as needed.
- // This is needed for computations below.
- const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
+ // This is needed for computations below.
+ const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
- // The expected output format is also an entire <out_width> columns interleaved, then the next set of
- // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
- // a time.
- for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
- unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
+ // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+ // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
+ // a time.
+ for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
+ unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
- // Track where we are and how much work is left.
- unsigned int kpos = k0;
- unsigned int kleft = k_size;
+ // Track where we are and how much work is left.
+ unsigned int kpos = k0;
+ unsigned int kleft = k_size;
- while (kleft) {
- // Which section are we in? Based on the rounded-up section size.
- unsigned int k_section_base = kpos / rounded_section_size;
- // How far into the section are we?
- unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+ while (kleft) {
+ // Which section are we in? Based on the rounded-up section size.
+ unsigned int k_section_base = kpos / rounded_section_size;
+ // How far into the section are we?
+ unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
- // We will either copy the rest of this section, or to the end of the requested length.
- unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
+ // We will either copy the rest of this section, or to the end of the requested length.
+ unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
- strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
- x0, xmax,
- (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
- (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ x0, xmax,
+ (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
+ (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
- // We need to modify our position based on the ROUNDED version of what we just did.
- unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+ // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll());
- buffer += strategy::out_width() * padded_length;
+ buffer += strategy::out_width() * padded_length;
- kpos += padded_length;
- kleft -= padded_length;
+ kpos += padded_length;
+ kleft -= padded_length;
+ }
}
+ } else {
+ // In the single K section case, we can process the whole lot in one go.
+ // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize.
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ 0, _args._Nsize, k0, std::min(kmax, _args._Ksize));
+ buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll());
}
}
}
@@ -597,12 +606,17 @@ public:
void set_pretransposed_B_data(void *in_buffer) override {
// Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
- _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
_col_bias = reinterpret_cast<int32_t *>(in_buffer);
}
- // Estimate cycles for given problem given provided parameters
- static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params, const OutputStage &os = {} ) {
+ // Estimate cycles for the given problem.
+ // "perf_type" is a type to pass along to get_performance_parameters to get the right set of performance
+ // parameters - it's arbitrary but usually either the input or output type.
+ template <typename perf_type>
+ static uint64_t estimate_cycles(const GemmArgs &args, const OutputStage &os = {}) {
+ const PerformanceParameters params = strategy::template get_performance_parameters<perf_type>(args._ci);
+
// Note: Current hybrid kernels don't actually round up height (they
// have paths for each possible height). Might need to make this
// configurable in future.
@@ -666,6 +680,17 @@ public:
assert(parms.input_channels == _args._Ksize);
_convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_HYBRID;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _n_block;
+ c.filter = get_type_name<strategy>();
+
+ return c;
+ }
};
} // namespace arm_gemm
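
The PrepareB rework above splits two cases: with a single K section the whole block is transformed in one call (clamping kmax, which the block walker rounds up), while multiple sections must be walked in padded coordinates and transformed against the unpadded input. A small model of the sectioned walk, with assumed sizes (Ksize of 10 per section, k_unroll of 4, so each padded section occupies 12 buffer rows):

#include <algorithm>
#include <cstdio>

static unsigned int roundup(unsigned int x, unsigned int m) {
    unsigned int r = x % m;
    return r ? x + (m - r) : x;
}

int main() {
    const unsigned int Ksize = 10, k_unroll = 4;
    const unsigned int rounded_section = roundup(Ksize, k_unroll); // 12

    unsigned int kpos = 0, kleft = 24;   // one k_block spanning two sections
    while (kleft) {
        // Which section, and how far into it, based on padded coordinates.
        unsigned int section  = kpos / rounded_section;
        unsigned int k_offset = kpos - section * rounded_section;
        // Copy the rest of this section, or up to the requested length.
        unsigned int k_length = std::min(Ksize - k_offset, kleft);

        // Stands in for the PrepareB call: source rows in UNPADDED terms.
        std::printf("PrepareB rows [%u, %u)\n",
                    section * Ksize + k_offset,
                    section * Ksize + k_offset + k_length);

        // Advance by the ROUNDED amount, as the transform pads each section.
        unsigned int padded = roundup(k_length, k_unroll);
        kpos  += padded;
        kleft -= padded;
    }
}
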
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index e48d9b9a07..c72dca2e96 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -81,11 +81,42 @@ class GemmHybridQuantized : public GemmCommon<To, Tr> {
static unsigned int compute_k_block(const GemmArgs &args) {
// We don't support K blocks as we only temporarily store 32 bit results.
return args._Ksize;
+
+ if (args._cfg && args._cfg->inner_block_size) {
+ return args._cfg->inner_block_size;
+ }
+
+ const unsigned int L1_size = args._ci->get_L1_cache_size();
+
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ k_block /= strategy::k_unroll();
+ k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+
+ // So divide the space equally into that many blocks.
+ k_block = iceildiv(args._Ksize, numk_blocks);
+
+ // And round UP to the K unroll level required.
+ k_block = roundup(k_block, strategy::k_unroll());
+
+ return k_block;
}
static unsigned int compute_n_block(const GemmArgs &args) {
if (args._cfg && args._cfg->outer_block_size) {
- return args._cfg->outer_block_size;
+ unsigned int n_block = args._cfg->outer_block_size;
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ n_block /= strategy::out_width();
+ n_block = std::max(n_block, 1u) * strategy::out_width();
+
+ return n_block;
}
const unsigned int k_block = compute_k_block(args);
@@ -279,6 +310,17 @@ public:
_qp.bias = bias;
_qp.bias_multi_stride = bias_multi_stride;
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_HYBRID;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _n_block;
+ c.filter = get_type_name<strategy>();
+
+ return c;
+ }
};
} // namespace arm_gemm
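
The blocking code added above (kept behind the early return, since this class still stores full-width 32-bit results) follows the same L1-cache recipe as the other GEMM classes: size k_block so the larger tile dimension fills half of L1, round to the K unroll, then split K evenly across the resulting number of blocks. A worked sketch with assumed figures (32 KB L1, 4-byte operands, a 6x16 tile, k_unroll of 4):

#include <algorithm>
#include <cstdio>

static unsigned int iceildiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }
static unsigned int roundup (unsigned int a, unsigned int b) { return iceildiv(a, b) * b; }

int main() {
    const unsigned int L1 = 32 * 1024, elem = 4;
    const unsigned int out_width = 16, out_height = 6, k_unroll = 4;
    const unsigned int Ksize = 1000;

    // Fit the larger tile dimension into half the cache.
    unsigned int k_block = (L1 / 2) / (elem * std::max(out_width, out_height)); // 256
    k_block = std::max(k_block / k_unroll, 1u) * k_unroll;   // multiple of k_unroll

    // Then spread K evenly over however many blocks that implies.
    unsigned int nblocks = iceildiv(Ksize, k_block);          // 4
    k_block = roundup(iceildiv(Ksize, nblocks), k_unroll);    // 252

    std::printf("k_block = %u over %u blocks\n", k_block, nblocks);
}
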
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index bfb3ca901f..cfbf66d60f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -34,11 +34,13 @@
#include "kernels/a64_gemm_s8_8x12.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_s8s32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp"
#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp"
#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp"
@@ -47,46 +49,56 @@ namespace arm_gemm {
static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+GemmImplementation<int8_t, int32_t>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8s32_mmla_6x4VL",
+ [](const GemmArgs &args) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int32_t>(args); }
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_mmla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>(args); }
-},
-#endif // ARM_COMPUTE_ENABLE_I8MM
+),
{
GemmMethod::GEMM_HYBRID,
"sve_smallK_hybrid_s8s32_dot_8x1VL",
- [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return args._ci->has_svei8mm() && args._Ksize<=64 && !args._indirect_input; },
+ nullptr,
[](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int32_t>(args); }
},
-{
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8s32_dot_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize>=16; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>(args); }
-},
-{
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_dot_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>(args); }
-},
-#endif // SVE
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_s8s32_mmla_8x12",
- [](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- nullptr,
+ [](const GemmArgs &args) { return args._ci->has_i8mm() && (args._Ksize>8); },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>(args); }
-},
-#endif // ARM_COMPUTE_ENABLE_I8MM
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8s32_mmla_6x16",
+ [](const GemmArgs &args) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int32_t>(args); }
+),
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_s8s32_dot_8x4",
@@ -108,27 +120,29 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
[](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && ((args._Msize > 28) || ((args._Msize % 8) > 4)); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int8_t, int32_t>(args); },
},
-{
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8s32_dot_6x16",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>(args); }
-},
-{
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_8x12",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>(args); }
-},
-{
+),
+GemmImplementation<int8_t, int32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_4x4",
nullptr,
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>::estimate_cycles<int32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>(args); }
-},
+),
+
{
GemmMethod::DEFAULT,
"",
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 7f870b83d7..5639cb4182 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -192,7 +192,7 @@ void kernel_and_merge<true, Requantize32>::run(
{
#ifdef CYCLE_PROFILING
- auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+ auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
// The interleaved kernel outputs in blocks - each block is a
// row-major matrix of size out_width * out_height. The merge
@@ -496,7 +496,7 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
static unsigned int get_k_block_size(const GemmArgs &args) {
if (args._cfg && args._cfg->inner_block_size) {
- return args._cfg->inner_block_size;
+ return roundup(args._cfg->inner_block_size, strategy::k_unroll());
}
// K blocking not supported if we are requantizing.
@@ -947,47 +947,55 @@ public:
/* Figure out the size of each block. */
unsigned int k_size = (current.kmax() - current.k0());
- // We need to insert padding at the end of each K section.
- // The computation needed is a little delicate - the coordinates from the block walker are expressed in
- // terms of the full, padded, _Ktotal.
- // But we need to transform each section with reference to the original, unpadded, input, letting the
- // transform pad each section as needed.
+ if (_Ksections > 1) {
+ // We need to insert padding at the end of each K section.
+ // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+ // terms of the full, padded, _Ktotal.
+ // But we need to transform each section with reference to the original, unpadded, input, letting the
+ // transform pad each section as needed.
- // This is needed for computations below.
- const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
+ // This is needed for computations below.
+ const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
- // The expected output format is also an entire <out_width> columns interleaved, then the next set of
- // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
- // a time.
- for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){
- unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
+ // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+ // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
+ // a time.
+ for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ) {
+ unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
- // Track where we are and how much work is left.
- unsigned int kpos = current.k0();
- unsigned int kleft = k_size;
+ // Track where we are and how much work is left.
+ unsigned int kpos = current.k0();
+ unsigned int kleft = k_size;
- while (kleft) {
- // Which section are we in? Based on the rounded-up section size.
- unsigned int k_section_base = kpos / rounded_section_size;
- // How far into the section are we?
- unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+ while (kleft) {
+ // Which section are we in? Based on the rounded-up section size.
+ unsigned int k_section_base = kpos / rounded_section_size;
+ // How far into the section are we?
+ unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
- // We will either copy the rest of this section, or to the end of the requested length.
- unsigned int k_length = std::min(_Ksize - k_offset, kleft);
+ // We will either copy the rest of this section, or to the end of the requested length.
+ unsigned int k_length = std::min(_Ksize - k_offset, kleft);
- strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
- x0, xmax,
- (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
- (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
+ strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+ x0, xmax,
+ (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
+ (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
- // We need to modify our position based on the ROUNDED version of what we just did.
- unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+ // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll());
- buffer += strategy::out_width() * padded_length;
+ buffer += strategy::out_width() * padded_length;
- kpos += padded_length;
- kleft -= padded_length;
+ kpos += padded_length;
+ kleft -= padded_length;
+ }
}
+ } else {
+ // In the single K section case, we can process the whole lot in one go.
+ // Caution: 'blockwalker::kmax()' rounds up, so clamp to valid _Ksize.
+ strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+ current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize));
+ buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
}
} while (current.advance());
}
@@ -1019,12 +1027,15 @@ public:
}
// Estimate cycles for given problem given provided parameters
- static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+ template<typename perf_type>
+ static uint64_t estimate_cycles(const GemmArgs &args) {
unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+ const PerformanceParameters &params = strategy::template get_performance_parameters<perf_type>(args._ci);
+
uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi);
- uint64_t merge_bytes = static_cast<uint16_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+ uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * args._Msize * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
@@ -1042,6 +1053,17 @@ public:
return static_cast<uint64_t>(total_cycles);
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMM_INTERLEAVED;
+ c.inner_block_size = _k_block;
+ c.outer_block_size = _x_block;
+ c.filter = get_type_name<strategy>();
+
+ return c;
+ }
};
// Aliases for the variations
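
estimate_cycles() above prices three phases against PerformanceParameters: the MAC kernel itself, interleaving the A side, and merging the output; note the merge term now uses the true _Msize rather than the rounded-up height. A stripped-down version of the same arithmetic, with illustrative rates rather than measured figures and a single k-block assumed:

#include <cstdint>
#include <cstdio>

// Plays the role of PerformanceParameters; values are illustrative only.
struct Rates { float macs_cycle, prepare_bytes_cycle, merge_bytes_cycle; };

static uint64_t estimate_cycles(unsigned M, unsigned N, unsigned K, Rates r) {
    uint64_t macs          = uint64_t(M) * N * K;   // kernel work
    uint64_t prepare_bytes = uint64_t(M) * K * 4;   // interleave A (fp32)
    uint64_t merge_bytes   = uint64_t(M) * N * 4;   // write C once (one k-block)

    float cycles = macs / r.macs_cycle
                 + prepare_bytes / r.prepare_bytes_cycle
                 + merge_bytes / r.merge_bytes_cycle;
    return uint64_t(cycles);
}

int main() {
    Rates assumed{32.0f, 16.0f, 8.0f};
    std::printf("%llu cycles\n",
                (unsigned long long)estimate_cycles(256, 256, 256, assumed));
}
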
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
index 985567f6f3..aa62815438 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
@@ -29,15 +29,21 @@
#include "kernels/a64_gemm_s8_4x4.hpp"
#include "kernels/a64_gemm_s8_8x12.hpp"
#include "kernels/a64_hybrid_s8qa_dot_4x16.hpp"
+#include "kernels/a64_hybrid_s8qa_mmla_4x16.hpp"
#include "kernels/a64_hybrid_s8qs_dot_6x16.hpp"
+#include "kernels/a64_hybrid_s8qs_mmla_6x16.hpp"
#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_s8s32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp"
#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp"
-#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
#include "kernels/sve_hybrid_s8qa_dot_4x4VL.hpp"
+#include "kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp"
#include "kernels/sve_hybrid_s8qs_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp"
@@ -54,62 +60,98 @@ namespace arm_gemm {
static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods[] =
{
#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8qa_mmla_4x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qa_mmla_4x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_mmla_4x4VL, int8_t, int8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8qs_mmla_6x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_symmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qs_mmla_6x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_mmla_6x4VL, int8_t, int8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_mmla_8x3VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>(args, qp); }
-},
-#endif // ARM_COMPUTE_ENABLE_I8MM
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_hybrid_s8s32_mmla_6x4VL",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_mmla_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); }
+),
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
"sve_smallK_hybrid_s8s32_dot_8x1VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ nullptr,
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int8_t>(args, qp); }
},
-#ifdef ARM_COMPUTE_ENABLE_SVE2
-{
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8qs_dot_6x4VL",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_symmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>(args, qp); }
-},
-{
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8qa_dot_4x4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>(args, qp); }
-},
-#endif // ARM_COMPUTE_ENABLE_SVE2
-{
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_s8s32_dot_6x4VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); }
-},
-{
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_s8s32_dot_8x3VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>(args, qp); }
-},
-#endif // SVE
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8qa_mmla_4x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_mmla_4x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_mmla_4x16, int8_t, int8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8qs_mmla_6x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_symmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_mmla_6x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_mmla_6x16, int8_t, int8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_s8s32_mmla_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); },
- nullptr,
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>(args, qp); }
-},
-#endif // ARM_COMPUTE_ENABLE_I8MM
+),
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_hybrid_s8s32_mmla_6x16",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_mmla_6x16, int8_t, int8_t, Requantize32, true>(args, qp); }
+),
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
"a64_smallK_hybrid_s8s32_dot_8x4",
@@ -135,42 +177,42 @@ GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8qs_dot_6x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_symmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>::estimate_cycles(args, cls_a64_hybrid_s8qs_dot_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>(args, qp); }
),
GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8qa_dot_4x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>::estimate_cycles(args, cls_a64_hybrid_s8qa_dot_4x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>(args, qp); }
),
GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_s8s32_dot_6x16",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles(args, cls_a64_hybrid_s8s32_dot_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>(args, qp); }
),
GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>::estimate_cycles(args, cls_a64_gemm_s8_8x12::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>(args, qp); }
),
-{
+GemmImplementation<int8_t, int8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_s8_4x4",
nullptr,
- [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>::estimate_cycles<int8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>(args, qp); }
-},
+),
{
GemmMethod::QUANTIZE_WRAPPER,
"quantized_wrapper",
[](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
- [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+ [](const GemmArgs &, const Requantize32 &) { return false; },
[](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); }
},
{
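
The quantized_wrapper change at the end of this table is worth a note: its second lambda now always returns false, demoting the wrapper from "preferred on non-dotprod CPUs" to a pure last resort. In these tables the first lambda gates support and the second only steers preference; a compact sketch of that two-pass selection idea (the names and the simplified lambda signatures here are mine, not the library's):

#include <cstdio>

struct Method {
    const char *name;
    bool (*is_supported)();
    bool (*is_recommended)();
};

static const Method methods[] = {
    { "mmla_kernel",       [] { return false; }, [] { return true;  } }, // not supported here
    { "dot_kernel",        [] { return true;  }, [] { return true;  } },
    { "quantized_wrapper", [] { return true;  }, [] { return false; } }, // last resort only
};

int main() {
    // First pass: supported AND recommended; second pass: merely supported.
    for (int pass = 0; pass < 2; pass++)
        for (const Method &m : methods)
            if (m.is_supported() && (pass == 1 || m.is_recommended())) {
                std::printf("picked %s\n", m.name);
                return 0;
            }
    return 0;
}
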
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
index f3f2f335fd..abd2799583 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
@@ -29,13 +29,17 @@
#include "kernels/a64_gemm_u8_4x4.hpp"
#include "kernels/a64_gemm_u8_8x12.hpp"
#include "kernels/a64_hybrid_u8qa_dot_4x16.hpp"
+#include "kernels/a64_hybrid_u8qa_mmla_4x16.hpp"
#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_u8u32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
-#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
#include "kernels/sve_hybrid_u8qa_dot_4x4VL.hpp"
+#include "kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp"
+#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
@@ -51,55 +55,77 @@ namespace arm_gemm {
static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_methods[] =
{
#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_u8qa_mmla_4x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp) && args._ci->has_sve2() && args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8qa_mmla_4x4VL, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_mmla_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_mmla_8x3VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>(args, qp); }
-},
-#endif
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_hybrid_u8u32_mmla_6x4VL",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+),
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
"sve_smallK_hybrid_u8u32_dot_8x1VL",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ nullptr,
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint8_t>(args, qp); }
},
-#ifdef ARM_COMPUTE_ENABLE_SVE2 // Requantizing kernels include some SVE2 only instructions (SQRDMULH, SRSHL)
-{
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8qa_dot_4x4VL",
- [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve2() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); }
-},
-#endif // ARM_COMPUTE_ENABLE_SVE2
-{
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8u32_dot_6x4VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); }
-},
-{
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_dot_8x3VL",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>(args, qp); }
-},
-#endif
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_u8qa_mmla_4x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_i8mm() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_mmla_4x16, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_mmla_4x16, uint8_t, uint8_t, Requantize32>(args, qp); }
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_u8u32_mmla_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm() && (args._Ksize>8); },
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>(args, qp); }
-},
-#endif
+),
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_hybrid_u8u32_mmla_6x16",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+),
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
"a64_smallK_hybrid_u8u32_dot_8x4",
@@ -125,35 +151,35 @@ GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8qa_dot_4x16",
[](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, int8_t, int8_t, Requantize32>::estimate_cycles(args, cls_a64_hybrid_u8qa_dot_4x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>(args, qp); }
),
GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8u32_dot_6x16",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, int8_t, int8_t, Requantize32, true>::estimate_cycles(args, cls_a64_hybrid_u8u32_dot_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, Requantize32, true>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, Requantize32, true>(args, qp); }
),
GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_8x12",
[](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, int8_t, int8_t>::estimate_cycles(args, cls_a64_gemm_u8_8x12::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>(args, qp); }
),
-{
+GemmImplementation<uint8_t, uint8_t, Requantize32>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_4x4",
nullptr,
- [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+ [](const GemmArgs &args, const Requantize32 &) { return GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>::estimate_cycles<uint8_t>(args); },
[](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>(args, qp); }
-},
+),
{
GemmMethod::QUANTIZE_WRAPPER,
"quantized_wrapper",
[](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
- [](const GemmArgs &args, const Requantize32 &) { return !args._ci->has_dotprod(); },
+ [](const GemmArgs &, const Requantize32 &) { return false; },
[](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); }
},
{
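
Annotation: the net effect of swapping the boolean recommendation lambdas (CPU-model checks such as != CPUModel::KLEIN, or the Nsize/Ksize thresholds removed in gemm_uint8.cpp below) for with_estimate entries is that kernel choice becomes a comparison of per-kernel cycle estimates. A hedged sketch of that selection walk; the exact precedence rules live in the library's find_implementation, which is not reproduced here:

#include <cstddef>
#include <cstdint>
#include <limits>

// Illustrative only: lowest estimate among supported kernels wins, and the
// first supported entry without an estimate serves as the fallback.
template<typename Entry, typename Args, typename OS>
const Entry *pick_kernel(const Entry *table, size_t n, const Args &args, const OS &os) {
    const Entry *best = nullptr;
    uint64_t best_cost = std::numeric_limits<uint64_t>::max();
    for (size_t i = 0; i < n; i++) {
        const Entry &e = table[i];
        if (e.is_supported && !e.is_supported(args, os)) continue; // cannot run here
        if (!e.cycle_estimate) {            // un-estimated entry:
            if (!best) best = &e;           // keep only as a fallback
            continue;
        }
        uint64_t cost = e.cycle_estimate(args, os);
        if (cost < best_cost) { best_cost = cost; best = &e; }
    }
    return best;
}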
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 4c05fd1b73..75d6b362cc 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -34,11 +34,13 @@
#include "kernels/a64_gemm_u8_4x4.hpp"
#include "kernels/a64_gemm_u8_8x12.hpp"
#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_hybrid_u8u32_mmla_6x16.hpp"
#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp"
#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
@@ -47,46 +49,56 @@ namespace arm_gemm {
static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
#ifdef ARM_COMPUTE_ENABLE_SVE
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_u8u32_mmla_6x4VL",
+ [](const GemmArgs &args) { return args._ci->has_svei8mm(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_mmla_6x4VL, uint8_t, uint32_t>(args); }
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_mmla_8x3VL",
[](const GemmArgs &args) { return args._ci->has_svei8mm() && (args._Ksize>8); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>(args); }
-},
-#endif
+),
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_u8u32_dot_8x1VL",
+ "sve_smallK_hybrid_u8u32_dot_8x1VL",
[](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ nullptr,
[](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint32_t>(args); }
},
-{
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"sve_hybrid_u8u32_dot_6x4VL",
[](const GemmArgs &args) { return args._ci->has_sve(); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>(args); }
-},
-{
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"sve_interleaved_u8u32_dot_8x3VL",
[](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); },
- [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; },
+ [](const GemmArgs &args) { return GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>(args); }
-},
-#endif
-#ifdef ARM_COMPUTE_ENABLE_I8MM
-{
+),
+#endif // ARM_COMPUTE_ENABLE_SVE
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_interleaved_u8u32_mmla_8x12",
[](const GemmArgs &args) { return args._ci->has_i8mm() && (args._Ksize>8); },
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>(args); }
-},
-#endif
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_u8u32_mmla_6x16",
+ [](const GemmArgs &args) { return args._ci->has_i8mm(); },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_mmla_6x16, uint8_t, uint32_t>(args); }
+),
{
GemmMethod::GEMM_HYBRID,
"a64_smallK_hybrid_u8u32_dot_8x4",
@@ -108,27 +120,27 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
[](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint8_t, uint32_t>(args); },
},
-{
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_HYBRID,
"a64_hybrid_u8u32_dot_6x16",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>(args); }
-},
-{
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_8x12",
[](const GemmArgs &args) { return args._ci->has_dotprod(); },
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>(args); }
-},
-{
+),
+GemmImplementation<uint8_t, uint32_t>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
"a64_gemm_u8_4x4",
nullptr,
- nullptr,
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>::estimate_cycles<uint32_t>(args); },
[](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>(args); }
-},
+),
{
GemmMethod::DEFAULT,
"",
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 12216009d2..4fc9b3456a 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -95,6 +95,18 @@ public:
void set_pretransposed_B_data(void *buffer) override {
_subgemm->set_pretransposed_B_data(buffer);
}
+
+ GemmConfig get_config() override {
+ GemmConfig c = _subgemm->get_config();
+
+ std::string new_filter = "gemv_batched[";
+ new_filter.append(c.filter);
+ new_filter.append("]");
+
+ c.filter = new_filter;
+
+ return c;
+ }
};
} // namespace arm_gemm
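
Annotation: the get_config() override added to GemvBatched above only decorates the wrapped sub-GEMM's report, so a batched GEMV over, say, a dot-product kernel is reported as gemv_batched[<inner kernel name>] when callers query the chosen configuration. A minimal sketch of that composition:

#include <string>

struct GemmConfig { std::string filter; /* plus method, block sizes, ... */ };

// Same string composition as the override above.
GemmConfig wrap_filter(GemmConfig c) {
    c.filter = "gemv_batched[" + c.filter + "]";
    return c;
}
// wrap_filter({"sve_hybrid_u8u32_dot_6x4VL"}).filter
//   == "gemv_batched[sve_hybrid_u8u32_dot_6x4VL]"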
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 9de44fcb73..d4348beabf 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,12 +36,55 @@
namespace arm_gemm {
+namespace {
+
+template<typename OutputStage>
+class run_gemv_kernel {
+public:
+ template<typename strategy, typename To, typename Tr>
+ static void run (
+ const strategy &strat,
+ const To *A_ptr, const To *B_ptr, Tr *c_ptr,
+ size_t N, size_t K,
+ const Tr *bias, const Activation &act, bool Accumulate,
+ const OutputStage &os, const int32_t *col_bias, unsigned int col_base
+ );
+};
+
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_gemv_kernel<Nothing>::run(
+ const strategy &strat,
+ const To *A_ptr, const To *B_ptr, Tr *C_ptr,
+ size_t N, size_t K,
+ const Tr *bias, const Activation &act, bool Accumulate,
+ const Nothing &, const int32_t *, unsigned int
+ ) {
+
+ strat.kernel(A_ptr, B_ptr, C_ptr, N, K, bias, act, Accumulate);
+}
+
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_gemv_kernel<Requantize32>::run(
+ const strategy &strat,
+ const To *A_ptr, const To *B_ptr, Tr *C_ptr,
+ size_t N, size_t K,
+ const Tr *, const Activation &, bool,
+ const Requantize32 &qp, const int32_t *col_bias, unsigned int col_base
+ ) {
+
+ strat.kernel(A_ptr, B_ptr, C_ptr, N, K, &qp, col_bias + col_base, col_base);
+}
+
+} // anonymous namespace
+
// Implementation of the GemmCommon abstract class.
//
// This implementation is for GEMV with pretransposition.
//
// Batches are not supported, as a batched GEMV makes no sense (it can be converted to a GEMM).
-template<typename strategy, typename To, typename Tr>
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
class GemvPretransposed : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
@@ -55,13 +98,28 @@ class GemvPretransposed : public GemmCommon<To, Tr> {
const Toi *_B_pretransposed = nullptr;
+ OutputStage _os;
+
+ // Pointer to the column sums (for quantized cases)
+ int32_t *col_bias = nullptr;
+
+ // Get size of the column sums
+ unsigned int get_col_sum_size() const {
+ if(std::is_same<OutputStage, Requantize32>::value) {
+ return _args._Nsize * _args._nmulti * sizeof(int32_t);
+ } else {
+ return 0;
+ }
+ }
+
public:
GemvPretransposed(GemvPretransposed &) = delete;
GemvPretransposed & operator= (GemvPretransposed &) = delete;
- GemvPretransposed(const GemmArgs &args)
+ GemvPretransposed(const GemmArgs &args, const OutputStage &os = {})
: _args(args),
- _buffer_per_multi(args._Ksize * roundup(args._Nsize, strategy::out_width())) {
+ _buffer_per_multi(roundup(args._Ksize, strategy::k_unroll()) * roundup(args._Nsize, strategy::out_width())),
+ _os(os) {
/* For now don't do any blocking. TODO: figure out if we should. */
if (strategy::supports_accumulate() && args._cfg && args._cfg->inner_block_size) {
k_block = args._cfg->inner_block_size;
@@ -117,12 +175,13 @@ public:
#ifdef CYCLE_PROFILING
auto p = prof.ScopedProfiler(PROFILE_KERNEL, (kmax-k0) * (nmax-n));
#endif
- strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + k0,
+ run_gemv_kernel<OutputStage>::run(strat, this->_Aptr + (multi * this->_A_multi_stride) + k0,
_B_pretransposed + (multi * _buffer_per_multi) + (n * roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()),
this->_Cptr + (multi * this->_C_multi_stride) + n,
(nmax - n), (kmax-k0),
this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n : nullptr,
- _args._act, (k0 != 0));
+ _args._act, (k0 != 0),
+ _os, col_bias, n + (_args._Nsize * multi));
}
}
}
@@ -139,11 +198,26 @@ public:
}
size_t get_B_pretransposed_array_size() const override {
- return _buffer_per_multi * _args._nmulti * sizeof(To);
+ return _buffer_per_multi * _args._nmulti * sizeof(To) + get_col_sum_size();
}
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
- Toi *B_buffer = reinterpret_cast<Toi *>(buffer);
+ // Column sums go on the front of the pretransposed buffer in requantized cases.
+ // We could optimize here in case we don't actually need to sum the columns, but this code is only run on setup.
+ if (std::is_same<OutputStage, Requantize32>::value) {
+ col_bias = reinterpret_cast<int32_t *>(buffer);
+
+ Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
+
+ for (unsigned int i=0; i<_args._nmulti; i++) {
+ compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _args._Nsize), _args._Ksize, i, 0);
+ }
+ }
+
+ // The actual transposed buffer goes after the column sums (if any)
+ uintptr_t buffer_int = reinterpret_cast<uintptr_t>(buffer);
+ Toi *B_buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+
strategy strat(_args._ci);
for (unsigned int multi=0; multi<_args._nmulti; multi++) {
@@ -156,6 +230,17 @@ public:
void set_pretransposed_B_data(void *buffer) override {
_B_pretransposed = reinterpret_cast<Toi *>(buffer);
}
+
+ GemmConfig get_config() override {
+ GemmConfig c;
+
+ c.method = GemmMethod::GEMV_PRETRANSPOSED;
+ c.inner_block_size = k_block;
+ c.outer_block_size = n_block;
+ c.filter = get_type_name<strategy>();
+
+ return c;
+ }
};
} // namespace arm_gemm
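
Annotation: two things happen in the GemvPretransposed changes above. First, the kernel call is routed through run_gemv_kernel<OutputStage>, whose Nothing and Requantize32 specializations select between the plain and requantizing kernel signatures at compile time. Second, the pretransposed B buffer grows a column-sum header in the quantized case: col_bias occupies the front of the buffer and the transposed B data follows it. A worked sketch of the resulting size arithmetic, with _buffer_per_multi computed as in the constructor above:

#include <cstddef>
#include <cstdint>

static size_t roundup(size_t v, size_t m) { return ((v + m - 1) / m) * m; }

struct Layout {
    size_t col_sum_bytes;   // 0 for the non-quantized (Nothing) case
    size_t b_bytes;         // pretransposed B data, placed after the sums
};

Layout gemv_buffer(size_t N, size_t K, size_t nmulti,
                   size_t out_width, size_t k_unroll,
                   size_t elem_size, bool quantized) {
    size_t per_multi = roundup(K, k_unroll) * roundup(N, out_width);
    return { quantized ? N * nmulti * sizeof(int32_t) : 0,
             per_multi * nmulti * elem_size };
}
// e.g. N=96, K=200, nmulti=1, out_width=16, k_unroll=4, int8 B, quantized:
//   col_sum_bytes = 96*4 = 384; b_bytes = 200*96 = 19200; total = 19584.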
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
new file mode 100644
index 0000000000..533682c647
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_fp32_bf16.hpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 4, VLType::None, false>(
+ bfloat16 * &out_ptr, const float * const * in, size_t width, size_t height,
+ size_t row_offset, bool
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n"
+ "cmp %x[height], #0x8\n"
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #2\n"
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #2\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #2\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #2\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #2\n"
+ "add x22, x22, %x[row_offset], LSL #2\n"
+ "add x21, x21, %x[row_offset], LSL #2\n"
+ "add x20, x20, %x[row_offset], LSL #2\n"
+ "beq 1f\n"
+ "mov x20, x27\n"
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n"
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n"
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n"
+ "cmp %x[width], #0x4\n"
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n"
+ "2:" // Main loop head
+ "ldr q23, [x27], #0x10\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ "ldr q22, [x26], #0x10\n"
+ "subs %x[width], %x[width], #0x4\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ "ldr q21, [x25], #0x10\n"
+ "cmp %x[width], #0x4\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ "ldr q18, [x23], #0x10\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ "ldr q19, [x22], #0x10\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "ldr q16, [x21], #0x10\n"
+ "ldr q17, [x20], #0x10\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "str q21, [%x[out_ptr], #0x10]\n"
+ "str q18, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 6f\n"
+ "tbz %x[width], #1, 4f\n"
+ "ldr d23, [x27], #0x8\n"
+ "ldr d22, [x26], #0x8\n"
+ "mov x19, #0x1\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d20, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d16, [x21], #0x8\n"
+ "ldr d17, [x20], #0x8\n"
+ "tbz %x[width], #0, 5f\n"
+ "ld1 { v23.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x26]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v20.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v16.s }[2], [x21]\n"
+ "ld1 { v17.s }[2], [x20]\n"
+ "b 5f\n"
+ "4:" // odd_loads_1_0
+ "ldr s23, [x27, #0x0]\n"
+ "mov x19, #0x1\n"
+ "ldr s22, [x26, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s20, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s16, [x21, #0x0]\n"
+ "ldr s17, [x20, #0x0]\n"
+ "5:" // Odd load end
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ ".inst 0x0ea16ab5 // bfcvtn v21.4h, v21.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16ad7 // bfcvtn2 v23.8h, v22.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ ".inst 0x4ea16a95 // bfcvtn2 v21.8h, v20.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "str q21, [%x[out_ptr], #0x10]\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q18, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "6:" // Odds skip
+
+ : [out_ptr] "+&r" (out_ptr), [width] "+&r" (width)
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
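
Annotation: the new interleave above converts eight rows of FP32 to BFloat16 on the fly (BFCVTN/BFCVTN2) and emits 8-row by 4-column blocks for the bf16 kernels, which is what enables the fast-mode FP32 path. A hedged scalar equivalent of what the assembly appears to produce, with round-to-nearest-even conversion and 64-byte output blocks; the tail handling and padding-row behaviour are simplified relative to the asm, and bfloat16 is represented as raw uint16_t bits:

#include <cstdint>
#include <cstring>

static uint16_t fp32_to_bf16_rne(float f) {         // BFCVTN-style rounding
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    bits += 0x7fff + ((bits >> 16) & 1);             // round to nearest even
    return static_cast<uint16_t>(bits >> 16);        // (NaN handling elided)
}

// Scalar sketch of interleave_block<8, 4, VLType::None, false> for
// fp32 -> bf16; my reading of the asm above, not the library's reference.
void interleave8_block4_fp32_bf16(uint16_t *&out, const float *const *in,
                                  size_t width, size_t height, size_t row_offset) {
    for (size_t c = 0; c < width; c += 4) {          // one 64-byte block per 4 columns
        for (size_t r = 0; r < 8; r++) {
            // Rows beyond 'height' are padded from row 0, as the CSELs do.
            const float *row = in[r < height ? r : 0] + row_offset;
            for (size_t i = 0; i < 4; i++)
                *out++ = (c + i < width) ? fp32_to_bf16_rne(row[c + i]) : 0;
        }
    }
}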
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
index 52b49c0f0c..b13d32c324 100644
--- a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
@@ -40,6 +40,7 @@
#include "a64_interleave8_block2_bf16_bf16.hpp"
#include "a64_interleave8_block2_fp32_fp32.hpp"
#include "a64_interleave8_block4_bf16_bf16.hpp"
+#include "a64_interleave8_block4_fp32_bf16.hpp"
#include "a64_interleave8_block4_s8_s8.hpp"
#include "a64_interleave8_block4_s8_s8_summing.hpp"
#include "a64_interleave8_block4_u8_u8_summing.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
index a6b1269927..d5003e4a19 100644
--- a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -348,6 +348,10 @@ template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_
template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // ARM_COMPUTE_ENABLE_BF16
/* Arm® Neon™/SVE using FP32 kernel */
@@ -375,12 +379,10 @@ template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * co
template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#ifdef ARM_COMPUTE_ENABLE_I8MM
/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // ARM_COMPUTE_ENABLE_I8MM
/* Arm® Neon™ SDOT (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
@@ -397,12 +399,10 @@ template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *
template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#ifdef ARM_COMPUTE_ENABLE_I8MM
/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
-#endif // ARM_COMPUTE_ENABLE_I8MM
/* Arm® Neon™ 16-bit (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index b68a5f518a..b7af7110ab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
namespace arm_gemm {
@@ -58,6 +59,28 @@ public:
StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
+ template<typename T>
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 3.32, 2.56, 2.63 };
+
+ default:
+ return { 7.97, 3.72, 7.31 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch(ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 3.33, 2.89, 0.09 };
+ default:
+ return { 7.97, 3.74, 0.34 };
+ }
+ }
+
+ return { 0.0 };
+ }
+
kern_type kernel=a64_gemm_s8_4x4;
cls_a64_gemm_s8_4x4(const CPUInfo *) { }
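
Annotation: the per-type get_performance_parameters additions (here and in the 8x12 headers below) feed the estimate_cycles selection shown earlier. The three figures read as throughputs for the main kernel, the input prepare/transform stage, and the result merge stage, with the template parameter T distinguishing the requantized 8-bit output path from the raw int32 accumulator path; that is presumably why the third figure differs so much between the two. A hedged sketch of how such a triple could become a cycle estimate; the names and the exact cost model are assumptions, not the library's code:

#include <cstddef>
#include <cstdint>

struct PerfParams {
    double kernel_macs_cycle;    // MACs retired per cycle in the inner kernel
    double prepare_bytes_cycle;  // bytes/cycle for interleaving/transforms
    double merge_bytes_cycle;    // bytes/cycle for writing out results
};

double estimate_cycles(uint64_t M, uint64_t N, uint64_t K,
                       size_t out_elem_size, const PerfParams &p) {
    double mac_cycles     = double(M) * N * K / p.kernel_macs_cycle;
    double prepare_cycles = double(M) * K / p.prepare_bytes_cycle;   // 1-byte operands assumed
    double merge_cycles   = double(M) * N * out_elem_size / p.merge_bytes_cycle;
    return mac_cycles + prepare_cycles + merge_cycles;
}
// Under this reading, the tiny int8 merge figures (0.09, 0.34) charge the
// requantizing merge far more per byte than a plain int32 store, which is
// why the parameters are tuned separately per output type.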
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 7c7b894b08..83ccb4681b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -61,13 +61,29 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 15.361, 0.9341, 0.1636 };
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 19.73, 2.81, 0.27 };
- default:
- return { 29.0698, 3.9793, 0.4003 };
+ case CPUModel::A55r1:
+ return { 15.361, 0.9341, 0.1636 };
+
+ default:
+ return { 29.0698, 3.9793, 0.4003 };
+ }
+ }
+
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 19.73, 3.41, 3.70 };
+
+ default:
+ return { 31.81, 3.68, 8.01 };
+ }
}
+
+ return { 0.0 };
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 854b6751c1..07c4769479 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#ifdef __aarch64__
+#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
namespace arm_gemm {
@@ -66,6 +67,30 @@ public:
StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
+ template<typename T>
+ static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 2.64, 2.72, 2.64 };
+
+ default:
+ return { 7.95, 3.76, 7.27 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch(ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 2.64, 1.79, 0.10 };
+ default:
+ return { 7.95, 4.09, 0.33 };
+ }
+ }
+
+ return { 0.0 };
+ }
+
kern_type kernel = a64_gemm_u8_4x4;
cls_a64_gemm_u8_4x4(const CPUInfo *) { }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index 00ed5d03bf..0329f57615 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2018,2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,14 +69,38 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 15.361, 0.9341, 0.1636 };
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 19.73, 3.38, 0.27 };
- default:
- return { 29.0698, 3.9793, 0.4003 };
+ case CPUModel::A55r1:
+ return { 15.361, 0.9341, 0.1636 };
+
+ case CPUModel::V1:
+ return { 62.40, 4.71, 0.67 };
+
+ default:
+ return { 29.0698, 3.9793, 0.4003 };
+ }
}
+
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 19.73, 3.38, 3.70 };
+
+ case CPUModel::V1:
+ return { 61.58, 4.78, 10.83 };
+
+ default:
+ return { 31.82, 3.51, 8.03 };
+ }
+ }
+
+ return { 0.0 };
}
kern_type kernel = a64_gemm_u8_8x12;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
deleted file mode 100644
index 51a9641af5..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
+++ /dev/null
@@ -1,1547 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-#include <limits>
-
-namespace arm_gemm {
-
-void a64_gemv_fp32_mla_32 (
- const float *A_ptr, const float *B_ptr, float *output_ptr,
- size_t N, size_t K,
- const float *bias, Activation act, bool
-)
-{
- struct KernelArgs {
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- const float *B_ptr = {};
- size_t output_offset = {};
- unsigned int input_initial_col = {};
- } ka;
-
- unsigned long flags=0;
- ka.B_ptr = B_ptr;
- switch(act.type) {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- ka.maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- ka.minval = 0;
- flags |= 0x2;
- break;
- }
- __asm__ __volatile__(
- "add x22, %x[N], #0x3\n"
- "mov x21, %x[bias]\n"
- "lsr x22, x22, #0x2\n"
- "1:" // Column loop
- "cmp x22, #0x8\n"
- "bge 85f\n"
- "cmp x22, #0x6\n"
- "bgt 73f\n"
- "beq 61f\n"
- "cmp x22, #0x4\n"
- "bgt 49f\n"
- "beq 37f\n"
- "cmp x22, #0x2\n"
- "bgt 25f\n"
- "beq 13f\n"
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 2f\n"
- "ldr q24, [x21, #0x0]\n"
- "add x21, x21, #0x10\n"
- "b 3f\n"
- "2:" // Width 1: no bias
- "movi v24.16b, #0x0\n"
- "3:" // Width 1: setup done
- "cmp x20, #0x4\n"
- "blt 6f\n"
- "cmp x20, #0x8\n"
- "blt 5f\n"
- "4:" // Width 1: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v2.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v3.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q4, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v4.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "bge 4b\n"
- "5:" // Width 1: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q5, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v5.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q6, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v6.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q7, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v7.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q8, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v8.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "6:" // Width 1: Multiply loop: Main loop skip
- "cbz x20, 8f\n"
- "7:" // Width 1: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q9, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "sub x20, x20, #0x1\n"
- "cbnz x20, 7b\n"
- "8:" // Width 1: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 9f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "9:" // Width 1: No activation
- "cmp %x[N], #0x4\n"
- "blt 10f\n"
- "str q24, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 12f\n"
- "10:" // Width 1: Partial writeback
- "tbz %x[N], #1, 11f\n"
- "str d24, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 12f\n"
- "st1 { v24.s }[2], [%x[output_ptr]]\n"
- "b 12f\n"
- "11:" // Width 1: Partial direct writeback: partial_1_0
- "str s24, [%x[output_ptr], #0x0]\n"
- "12:" // Width 1: Writeback done
- "b 97f\n"
- "13:" // Width 2
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 14f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "add x21, x21, #0x20\n"
- "b 15f\n"
- "14:" // Width 2: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "15:" // Width 2: setup done
- "cmp x20, #0x4\n"
- "blt 18f\n"
- "cmp x20, #0x8\n"
- "blt 17f\n"
- "16:" // Width 2: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v3.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q4, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v4.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q5, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v5.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q6, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v6.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q7, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v7.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q8, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v8.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "bge 16b\n"
- "17:" // Width 2: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q9, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[0]\n"
- "ldr q10, [%x[B_ptr], #0x10]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v25.4s, v10.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q11, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v11.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q12, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v12.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v13.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v14.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q15, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v15.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q16, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v16.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "18:" // Width 2: Multiply loop: Main loop skip
- "cbz x20, 20f\n"
- "19:" // Width 2: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q17, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v17.4s, v0.s[0]\n"
- "ldr q18, [%x[B_ptr], #0x10]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v25.4s, v18.4s, v0.s[0]\n"
- "sub x20, x20, #0x1\n"
- "cbnz x20, 19b\n"
- "20:" // Width 2: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 21f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "21:" // Width 2: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "cmp %x[N], #0x8\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "blt 22f\n"
- "str q25, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 24f\n"
- "22:" // Width 2: Partial writeback
- "tbz %x[N], #1, 23f\n"
- "str d25, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 24f\n"
- "st1 { v25.s }[2], [%x[output_ptr]]\n"
- "b 24f\n"
- "23:" // Width 2: Partial direct writeback: partial_1_4
- "tbz %x[N], #0, 24f\n"
- "str s25, [%x[output_ptr], #0x0]\n"
- "24:" // Width 2: Writeback done
- "b 97f\n"
- "25:" // Width 3
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 26f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "add x21, x21, #0x30\n"
- "b 27f\n"
- "26:" // Width 3: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "27:" // Width 3: setup done
- "cmp x20, #0x4\n"
- "blt 30f\n"
- "cmp x20, #0x8\n"
- "blt 29f\n"
- "28:" // Width 3: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v4.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q5, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v5.4s, v0.s[1]\n"
- "ldr q6, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v6.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q7, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v7.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q8, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v8.4s, v0.s[2]\n"
- "ldr q9, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v9.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q10, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v10.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q11, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v11.4s, v0.s[3]\n"
- "ldr q12, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v12.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "bge 28b\n"
- "29:" // Width 3: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v13.4s, v0.s[0]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v14.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v26.4s, v15.4s, v0.s[0]\n"
- "ldr q16, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v16.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q17, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v17.4s, v0.s[1]\n"
- "ldr q18, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v18.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q19, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v19.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q20, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v20.4s, v0.s[2]\n"
- "ldr q21, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v21.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q22, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v22.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q23, [%x[B_ptr], #0x10]\n"
- "fmla v25.4s, v23.4s, v0.s[3]\n"
- "ldr q1, [%x[B_ptr], #0x20]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v1.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "30:" // Width 3: Multiply loop: Main loop skip
- "cbz x20, 32f\n"
- "31:" // Width 3: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v2.4s, v0.s[0]\n"
- "ldr q3, [%x[B_ptr], #0x10]\n"
- "ldr q4, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v3.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v4.4s, v0.s[0]\n"
- "sub x20, x20, #0x1\n"
- "cbnz x20, 31b\n"
- "32:" // Width 3: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 33f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "33:" // Width 3: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "cmp %x[N], #0xc\n"
- "add %x[output_ptr], %x[output_ptr], #0x20\n"
- "blt 34f\n"
- "str q26, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 36f\n"
- "34:" // Width 3: Partial writeback
- "tbz %x[N], #1, 35f\n"
- "str d26, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 36f\n"
- "st1 { v26.s }[2], [%x[output_ptr]]\n"
- "b 36f\n"
- "35:" // Width 3: Partial direct writeback: partial_1_8
- "tbz %x[N], #0, 36f\n"
- "str s26, [%x[output_ptr], #0x0]\n"
- "36:" // Width 3: Writeback done
- "b 97f\n"
- "37:" // Width 4
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 38f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "add x21, x21, #0x40\n"
- "b 39f\n"
- "38:" // Width 4: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "39:" // Width 4: setup done
- "cmp x20, #0x4\n"
- "blt 42f\n"
- "cmp x20, #0x8\n"
- "blt 41f\n"
- "40:" // Width 4: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q5, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v5.4s, v0.s[1]\n"
- "ldr q6, [%x[B_ptr], #0x10]\n"
- "ldr q7, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v6.4s, v0.s[1]\n"
- "ldr q8, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v7.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v8.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q9, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v9.4s, v0.s[2]\n"
- "ldr q10, [%x[B_ptr], #0x10]\n"
- "ldr q11, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v10.4s, v0.s[2]\n"
- "ldr q12, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v11.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v12.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v13.4s, v0.s[3]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v14.4s, v0.s[3]\n"
- "ldr q16, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v15.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v16.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "sub x20, x20, #0x4\n"
- "cmp x20, #0x8\n"
- "bge 40b\n"
- "41:" // Width 4: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q17, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v17.4s, v0.s[0]\n"
- "ldr q18, [%x[B_ptr], #0x10]\n"
- "ldr q19, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v18.4s, v0.s[0]\n"
- "ldr q20, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v19.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v20.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q21, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v21.4s, v0.s[1]\n"
- "ldr q22, [%x[B_ptr], #0x10]\n"
- "ldr q23, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v22.4s, v0.s[1]\n"
- "ldr q1, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v23.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v1.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v2.4s, v0.s[2]\n"
- "ldr q3, [%x[B_ptr], #0x10]\n"
- "ldr q4, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v3.4s, v0.s[2]\n"
- "ldr q5, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v4.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v5.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q6, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v6.4s, v0.s[3]\n"
- "ldr q7, [%x[B_ptr], #0x10]\n"
- "ldr q8, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v7.4s, v0.s[3]\n"
- "ldr q9, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v8.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v9.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add x19, x19, #0x10\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "42:" // Width 4: Multiply loop: Main loop skip
- "cbz x20, 44f\n"
- "43:" // Width 4: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q10, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v10.4s, v0.s[0]\n"
- "ldr q11, [%x[B_ptr], #0x10]\n"
- "ldr q12, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v11.4s, v0.s[0]\n"
- "ldr q13, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v12.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "sub x20, x20, #0x1\n"
- "fmla v27.4s, v13.4s, v0.s[0]\n"
- "cbnz x20, 43b\n"
- "44:" // Width 4: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 45f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "45:" // Width 4: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "cmp %x[N], #0x10\n"
- "add %x[output_ptr], %x[output_ptr], #0x30\n"
- "blt 46f\n"
- "str q27, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 48f\n"
- "46:" // Width 4: Partial writeback
- "tbz %x[N], #1, 47f\n"
- "str d27, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 48f\n"
- "st1 { v27.s }[2], [%x[output_ptr]]\n"
- "b 48f\n"
- "47:" // Width 4: Partial direct writeback: partial_1_12
- "tbz %x[N], #0, 48f\n"
- "str s27, [%x[output_ptr], #0x0]\n"
- "48:" // Width 4: Writeback done
- "b 97f\n"
- "49:" // Width 5
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 50f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x21, #0x40]\n"
- "add x21, x21, #0x50\n"
- "b 51f\n"
- "50:" // Width 5: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "51:" // Width 5: setup done
- "cmp x20, #0x4\n"
- "blt 54f\n"
- "cmp x20, #0x8\n"
- "blt 53f\n"
- "52:" // Width 5: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x40]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q6, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v5.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q7, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v6.4s, v0.s[1]\n"
- "ldr q8, [%x[B_ptr], #0x20]\n"
- "ldr q9, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v7.4s, v0.s[1]\n"
- "ldr q10, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v8.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v9.4s, v0.s[1]\n"
- "ldr q11, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v10.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q12, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v11.4s, v0.s[2]\n"
- "ldr q13, [%x[B_ptr], #0x20]\n"
- "ldr q14, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v12.4s, v0.s[2]\n"
- "ldr q15, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v13.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v14.4s, v0.s[2]\n"
- "ldr q16, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v15.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q17, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v16.4s, v0.s[3]\n"
- "ldr q18, [%x[B_ptr], #0x20]\n"
- "ldr q19, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v17.4s, v0.s[3]\n"
- "ldr q20, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v18.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v19.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v28.4s, v20.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "bge 52b\n"
- "53:" // Width 5: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q21, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v21.4s, v0.s[0]\n"
- "ldr q22, [%x[B_ptr], #0x10]\n"
- "ldr q23, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v22.4s, v0.s[0]\n"
- "ldr q1, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v23.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x40]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v1.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v2.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q4, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v3.4s, v0.s[1]\n"
- "ldr q5, [%x[B_ptr], #0x20]\n"
- "ldr q6, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v4.4s, v0.s[1]\n"
- "ldr q7, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v5.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v6.4s, v0.s[1]\n"
- "ldr q8, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v7.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q9, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v8.4s, v0.s[2]\n"
- "ldr q10, [%x[B_ptr], #0x20]\n"
- "ldr q11, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v9.4s, v0.s[2]\n"
- "ldr q12, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v10.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v11.4s, v0.s[2]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v12.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v13.4s, v0.s[3]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "ldr q16, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v14.4s, v0.s[3]\n"
- "ldr q17, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v15.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v16.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v28.4s, v17.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "54:" // Width 5: Multiply loop: Main loop skip
- "cbz x20, 56f\n"
- "55:" // Width 5: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q18, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v18.4s, v0.s[0]\n"
- "ldr q19, [%x[B_ptr], #0x10]\n"
- "ldr q20, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v19.4s, v0.s[0]\n"
- "ldr q21, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v20.4s, v0.s[0]\n"
- "ldr q22, [%x[B_ptr], #0x40]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v21.4s, v0.s[0]\n"
- "sub x20, x20, #0x1\n"
- "fmla v28.4s, v22.4s, v0.s[0]\n"
- "cbnz x20, 55b\n"
- "56:" // Width 5: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 57f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "57:" // Width 5: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "str q27, [%x[output_ptr], #0x30]\n"
- "cmp %x[N], #0x14\n"
- "add %x[output_ptr], %x[output_ptr], #0x40\n"
- "blt 58f\n"
- "str q28, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 60f\n"
- "58:" // Width 5: Partial writeback
- "tbz %x[N], #1, 59f\n"
- "str d28, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 60f\n"
- "st1 { v28.s }[2], [%x[output_ptr]]\n"
- "b 60f\n"
- "59:" // Width 5: Partial direct writeback: partial_1_16
- "tbz %x[N], #0, 60f\n"
- "str s28, [%x[output_ptr], #0x0]\n"
- "60:" // Width 5: Writeback done
- "b 97f\n"
- "61:" // Width 6
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 62f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x21, #0x40]\n"
- "ldr q29, [x21, #0x50]\n"
- "add x21, x21, #0x60\n"
- "b 63f\n"
- "62:" // Width 6: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "63:" // Width 6: setup done
- "cmp x20, #0x4\n"
- "blt 66f\n"
- "cmp x20, #0x8\n"
- "blt 65f\n"
- "64:" // Width 6: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x40]\n"
- "ldr q6, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v5.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q8, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v7.4s, v0.s[1]\n"
- "ldr q9, [%x[B_ptr], #0x20]\n"
- "ldr q10, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v8.4s, v0.s[1]\n"
- "ldr q11, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v9.4s, v0.s[1]\n"
- "ldr q12, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v10.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v11.4s, v0.s[1]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v12.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v13.4s, v0.s[2]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "ldr q16, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v14.4s, v0.s[2]\n"
- "ldr q17, [%x[B_ptr], #0x40]\n"
- "ldr q18, [%x[B_ptr], #0x50]\n"
- "fmla v26.4s, v15.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v16.4s, v0.s[2]\n"
- "ldr q19, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v17.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q20, [%x[B_ptr], #0x10]\n"
- "fmla v29.4s, v18.4s, v0.s[2]\n"
- "ldr q21, [%x[B_ptr], #0x20]\n"
- "ldr q22, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q23, [%x[B_ptr], #0x40]\n"
- "ldr q1, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v20.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v26.4s, v21.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v27.4s, v22.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "fmla v28.4s, v23.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "fmla v29.4s, v1.4s, v0.s[3]\n"
- "bge 64b\n"
- "65:" // Width 6: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v2.4s, v0.s[0]\n"
- "ldr q3, [%x[B_ptr], #0x10]\n"
- "ldr q4, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v4.4s, v0.s[0]\n"
- "ldr q6, [%x[B_ptr], #0x40]\n"
- "ldr q7, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v5.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v6.4s, v0.s[0]\n"
- "ldr q8, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q9, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v8.4s, v0.s[1]\n"
- "ldr q10, [%x[B_ptr], #0x20]\n"
- "ldr q11, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v9.4s, v0.s[1]\n"
- "ldr q12, [%x[B_ptr], #0x40]\n"
- "fmla v26.4s, v10.4s, v0.s[1]\n"
- "ldr q13, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v11.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v12.4s, v0.s[1]\n"
- "ldr q14, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v13.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q15, [%x[B_ptr], #0x10]\n"
- "fmla v24.4s, v14.4s, v0.s[2]\n"
- "ldr q16, [%x[B_ptr], #0x20]\n"
- "ldr q17, [%x[B_ptr], #0x30]\n"
- "fmla v25.4s, v15.4s, v0.s[2]\n"
- "ldr q18, [%x[B_ptr], #0x40]\n"
- "ldr q19, [%x[B_ptr], #0x50]\n"
- "fmla v26.4s, v16.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v17.4s, v0.s[2]\n"
- "ldr q20, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v18.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q21, [%x[B_ptr], #0x10]\n"
- "fmla v29.4s, v19.4s, v0.s[2]\n"
- "ldr q22, [%x[B_ptr], #0x20]\n"
- "ldr q23, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v20.4s, v0.s[3]\n"
- "ldr q1, [%x[B_ptr], #0x40]\n"
- "ldr q2, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v21.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v26.4s, v22.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v27.4s, v23.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "fmla v28.4s, v1.4s, v0.s[3]\n"
- "fmla v29.4s, v2.4s, v0.s[3]\n"
- "66:" // Width 6: Multiply loop: Main loop skip
- "cbz x20, 68f\n"
- "67:" // Width 6: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v3.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x10]\n"
- "ldr q5, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v4.4s, v0.s[0]\n"
- "ldr q6, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v5.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x40]\n"
- "ldr q8, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v6.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "sub x20, x20, #0x1\n"
- "fmla v28.4s, v7.4s, v0.s[0]\n"
- "fmla v29.4s, v8.4s, v0.s[0]\n"
- "cbnz x20, 67b\n"
- "68:" // Width 6: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 69f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "69:" // Width 6: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "str q27, [%x[output_ptr], #0x30]\n"
- "str q28, [%x[output_ptr], #0x40]\n"
- "cmp %x[N], #0x18\n"
- "add %x[output_ptr], %x[output_ptr], #0x50\n"
- "blt 70f\n"
- "str q29, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 72f\n"
- "70:" // Width 6: Partial writeback
- "tbz %x[N], #1, 71f\n"
- "str d29, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 72f\n"
- "st1 { v29.s }[2], [%x[output_ptr]]\n"
- "b 72f\n"
- "71:" // Width 6: Partial direct writeback: partial_1_20
- "tbz %x[N], #0, 72f\n"
- "str s29, [%x[output_ptr], #0x0]\n"
- "72:" // Width 6: Writeback done
- "b 97f\n"
- "73:" // Width 7
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 74f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x21, #0x40]\n"
- "ldr q29, [x21, #0x50]\n"
- "ldr q30, [x21, #0x60]\n"
- "add x21, x21, #0x70\n"
- "b 75f\n"
- "74:" // Width 7: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "75:" // Width 7: setup done
- "cmp x20, #0x4\n"
- "blt 78f\n"
- "cmp x20, #0x8\n"
- "blt 77f\n"
- "76:" // Width 7: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x40]\n"
- "ldr q6, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v28.4s, v5.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v29.4s, v6.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q8, [%x[B_ptr], #0x0]\n"
- "fmla v30.4s, v7.4s, v0.s[0]\n"
- "ldr q9, [%x[B_ptr], #0x10]\n"
- "ldr q10, [%x[B_ptr], #0x20]\n"
- "fmla v24.4s, v8.4s, v0.s[1]\n"
- "ldr q11, [%x[B_ptr], #0x30]\n"
- "ldr q12, [%x[B_ptr], #0x40]\n"
- "fmla v25.4s, v9.4s, v0.s[1]\n"
- "ldr q13, [%x[B_ptr], #0x50]\n"
- "fmla v26.4s, v10.4s, v0.s[1]\n"
- "ldr q14, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v11.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v12.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q15, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v13.4s, v0.s[1]\n"
- "ldr q16, [%x[B_ptr], #0x10]\n"
- "ldr q17, [%x[B_ptr], #0x20]\n"
- "fmla v30.4s, v14.4s, v0.s[1]\n"
- "ldr q18, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v15.4s, v0.s[2]\n"
- "ldr q19, [%x[B_ptr], #0x40]\n"
- "ldr q20, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v16.4s, v0.s[2]\n"
- "ldr q21, [%x[B_ptr], #0x60]\n"
- "fmla v26.4s, v17.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v18.4s, v0.s[2]\n"
- "ldr q22, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v19.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q23, [%x[B_ptr], #0x10]\n"
- "fmla v29.4s, v20.4s, v0.s[2]\n"
- "ldr q1, [%x[B_ptr], #0x20]\n"
- "ldr q2, [%x[B_ptr], #0x30]\n"
- "fmla v30.4s, v21.4s, v0.s[2]\n"
- "ldr q3, [%x[B_ptr], #0x40]\n"
- "fmla v24.4s, v22.4s, v0.s[3]\n"
- "ldr q4, [%x[B_ptr], #0x50]\n"
- "ldr q5, [%x[B_ptr], #0x60]\n"
- "fmla v25.4s, v23.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v1.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v2.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add x19, x19, #0x10\n"
- "fmla v28.4s, v3.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "sub x20, x20, #0x4\n"
- "fmla v29.4s, v4.4s, v0.s[3]\n"
- "cmp x20, #0x8\n"
- "fmla v30.4s, v5.4s, v0.s[3]\n"
- "bge 76b\n"
- "77:" // Width 7: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q6, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v6.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x10]\n"
- "ldr q8, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v7.4s, v0.s[0]\n"
- "ldr q9, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v8.4s, v0.s[0]\n"
- "ldr q10, [%x[B_ptr], #0x40]\n"
- "ldr q11, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v9.4s, v0.s[0]\n"
- "ldr q12, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v28.4s, v10.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v29.4s, v11.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q13, [%x[B_ptr], #0x0]\n"
- "fmla v30.4s, v12.4s, v0.s[0]\n"
- "ldr q14, [%x[B_ptr], #0x10]\n"
- "ldr q15, [%x[B_ptr], #0x20]\n"
- "fmla v24.4s, v13.4s, v0.s[1]\n"
- "ldr q16, [%x[B_ptr], #0x30]\n"
- "ldr q17, [%x[B_ptr], #0x40]\n"
- "fmla v25.4s, v14.4s, v0.s[1]\n"
- "ldr q18, [%x[B_ptr], #0x50]\n"
- "fmla v26.4s, v15.4s, v0.s[1]\n"
- "ldr q19, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v27.4s, v16.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v17.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q20, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v18.4s, v0.s[1]\n"
- "ldr q21, [%x[B_ptr], #0x10]\n"
- "ldr q22, [%x[B_ptr], #0x20]\n"
- "fmla v30.4s, v19.4s, v0.s[1]\n"
- "ldr q23, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v20.4s, v0.s[2]\n"
- "ldr q1, [%x[B_ptr], #0x40]\n"
- "ldr q2, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v21.4s, v0.s[2]\n"
- "ldr q3, [%x[B_ptr], #0x60]\n"
- "fmla v26.4s, v22.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v23.4s, v0.s[2]\n"
- "ldr q4, [%x[B_ptr], #0x0]\n"
- "fmla v28.4s, v1.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q5, [%x[B_ptr], #0x10]\n"
- "fmla v29.4s, v2.4s, v0.s[2]\n"
- "ldr q6, [%x[B_ptr], #0x20]\n"
- "ldr q7, [%x[B_ptr], #0x30]\n"
- "fmla v30.4s, v3.4s, v0.s[2]\n"
- "ldr q8, [%x[B_ptr], #0x40]\n"
- "fmla v24.4s, v4.4s, v0.s[3]\n"
- "ldr q9, [%x[B_ptr], #0x50]\n"
- "ldr q10, [%x[B_ptr], #0x60]\n"
- "fmla v25.4s, v5.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v26.4s, v6.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v27.4s, v7.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "add x19, x19, #0x10\n"
- "fmla v28.4s, v8.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "fmla v29.4s, v9.4s, v0.s[3]\n"
- "fmla v30.4s, v10.4s, v0.s[3]\n"
- "78:" // Width 7: Multiply loop: Main loop skip
- "cbz x20, 80f\n"
- "79:" // Width 7: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q11, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v11.4s, v0.s[0]\n"
- "ldr q12, [%x[B_ptr], #0x10]\n"
- "ldr q13, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v12.4s, v0.s[0]\n"
- "ldr q14, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v13.4s, v0.s[0]\n"
- "ldr q15, [%x[B_ptr], #0x40]\n"
- "ldr q16, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v14.4s, v0.s[0]\n"
- "ldr q17, [%x[B_ptr], #0x60]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "fmla v28.4s, v15.4s, v0.s[0]\n"
- "fmla v29.4s, v16.4s, v0.s[0]\n"
- "sub x20, x20, #0x1\n"
- "fmla v30.4s, v17.4s, v0.s[0]\n"
- "cbnz x20, 79b\n"
- "80:" // Width 7: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 81f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmin v30.4s, v30.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
- "81:" // Width 7: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "str q27, [%x[output_ptr], #0x30]\n"
- "str q28, [%x[output_ptr], #0x40]\n"
- "str q29, [%x[output_ptr], #0x50]\n"
- "cmp %x[N], #0x1c\n"
- "add %x[output_ptr], %x[output_ptr], #0x60\n"
- "blt 82f\n"
- "str q30, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 84f\n"
- "82:" // Width 7: Partial writeback
- "tbz %x[N], #1, 83f\n"
- "str d30, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 84f\n"
- "st1 { v30.s }[2], [%x[output_ptr]]\n"
- "b 84f\n"
- "83:" // Width 7: Partial direct writeback: partial_1_24
- "tbz %x[N], #0, 84f\n"
- "str s30, [%x[output_ptr], #0x0]\n"
- "84:" // Width 7: Writeback done
- "b 97f\n"
- "85:" // Width 8
- "mov x20, %x[K]\n"
- "mov x19, %x[A_ptr]\n"
- "cbz x21, 86f\n"
- "ldr q24, [x21, #0x0]\n"
- "ldr q25, [x21, #0x10]\n"
- "ldr q26, [x21, #0x20]\n"
- "ldr q27, [x21, #0x30]\n"
- "ldr q28, [x21, #0x40]\n"
- "ldr q29, [x21, #0x50]\n"
- "ldr q30, [x21, #0x60]\n"
- "ldr q31, [x21, #0x70]\n"
- "add x21, x21, #0x80\n"
- "b 87f\n"
- "86:" // Width 8: no bias
- "movi v24.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v26.16b, #0x0\n"
- "movi v27.16b, #0x0\n"
- "movi v28.16b, #0x0\n"
- "movi v29.16b, #0x0\n"
- "movi v30.16b, #0x0\n"
- "movi v31.16b, #0x0\n"
- "87:" // Width 8: setup done
- "cmp x20, #0x4\n"
- "blt 90f\n"
- "cmp x20, #0x8\n"
- "blt 89f\n"
- "88:" // Width 8: Multiply loop: Main loop head
- "ldr q0, [x19, #0x0]\n"
- "ldr q1, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v1.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x10]\n"
- "ldr q3, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v2.4s, v0.s[0]\n"
- "ldr q4, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v3.4s, v0.s[0]\n"
- "ldr q5, [%x[B_ptr], #0x40]\n"
- "ldr q6, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v4.4s, v0.s[0]\n"
- "ldr q7, [%x[B_ptr], #0x60]\n"
- "ldr q8, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v5.4s, v0.s[0]\n"
- "fmla v29.4s, v6.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v30.4s, v7.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q9, [%x[B_ptr], #0x0]\n"
- "fmla v31.4s, v8.4s, v0.s[0]\n"
- "ldr q10, [%x[B_ptr], #0x10]\n"
- "ldr q11, [%x[B_ptr], #0x20]\n"
- "fmla v24.4s, v9.4s, v0.s[1]\n"
- "ldr q12, [%x[B_ptr], #0x30]\n"
- "ldr q13, [%x[B_ptr], #0x40]\n"
- "fmla v25.4s, v10.4s, v0.s[1]\n"
- "fmla v26.4s, v11.4s, v0.s[1]\n"
- "ldr q14, [%x[B_ptr], #0x50]\n"
- "ldr q15, [%x[B_ptr], #0x60]\n"
- "fmla v27.4s, v12.4s, v0.s[1]\n"
- "ldr q16, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v13.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v29.4s, v14.4s, v0.s[1]\n"
- "ldr q17, [%x[B_ptr], #0x0]\n"
- "fmla v30.4s, v15.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q18, [%x[B_ptr], #0x10]\n"
- "fmla v31.4s, v16.4s, v0.s[1]\n"
- "ldr q19, [%x[B_ptr], #0x20]\n"
- "ldr q20, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v17.4s, v0.s[2]\n"
- "ldr q21, [%x[B_ptr], #0x40]\n"
- "ldr q22, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v18.4s, v0.s[2]\n"
- "ldr q23, [%x[B_ptr], #0x60]\n"
- "fmla v26.4s, v19.4s, v0.s[2]\n"
- "ldr q1, [%x[B_ptr], #0x70]\n"
- "fmla v27.4s, v20.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v21.4s, v0.s[2]\n"
- "ldr q2, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v22.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q3, [%x[B_ptr], #0x10]\n"
- "fmla v30.4s, v23.4s, v0.s[2]\n"
- "ldr q4, [%x[B_ptr], #0x20]\n"
- "ldr q5, [%x[B_ptr], #0x30]\n"
- "fmla v31.4s, v1.4s, v0.s[2]\n"
- "ldr q6, [%x[B_ptr], #0x40]\n"
- "fmla v24.4s, v2.4s, v0.s[3]\n"
- "ldr q7, [%x[B_ptr], #0x50]\n"
- "ldr q8, [%x[B_ptr], #0x60]\n"
- "fmla v25.4s, v3.4s, v0.s[3]\n"
- "ldr q9, [%x[B_ptr], #0x70]\n"
- "fmla v26.4s, v4.4s, v0.s[3]\n"
- "fmla v27.4s, v5.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v6.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v29.4s, v7.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x20, x20, #0x4\n"
- "fmla v30.4s, v8.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "cmp x20, #0x8\n"
- "fmla v31.4s, v9.4s, v0.s[3]\n"
- "bge 88b\n"
- "89:" // Width 8: Multiply loop: Single iteration only
- "sub x20, x20, #0x4\n"
- "ldr q0, [x19, #0x0]\n"
- "ldr q10, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v10.4s, v0.s[0]\n"
- "ldr q11, [%x[B_ptr], #0x10]\n"
- "ldr q12, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v11.4s, v0.s[0]\n"
- "ldr q13, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v12.4s, v0.s[0]\n"
- "ldr q14, [%x[B_ptr], #0x40]\n"
- "ldr q15, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v13.4s, v0.s[0]\n"
- "ldr q16, [%x[B_ptr], #0x60]\n"
- "ldr q17, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v14.4s, v0.s[0]\n"
- "fmla v29.4s, v15.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v30.4s, v16.4s, v0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q18, [%x[B_ptr], #0x0]\n"
- "fmla v31.4s, v17.4s, v0.s[0]\n"
- "ldr q19, [%x[B_ptr], #0x10]\n"
- "ldr q20, [%x[B_ptr], #0x20]\n"
- "fmla v24.4s, v18.4s, v0.s[1]\n"
- "ldr q21, [%x[B_ptr], #0x30]\n"
- "ldr q22, [%x[B_ptr], #0x40]\n"
- "fmla v25.4s, v19.4s, v0.s[1]\n"
- "fmla v26.4s, v20.4s, v0.s[1]\n"
- "ldr q23, [%x[B_ptr], #0x50]\n"
- "ldr q1, [%x[B_ptr], #0x60]\n"
- "fmla v27.4s, v21.4s, v0.s[1]\n"
- "ldr q2, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v22.4s, v0.s[1]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v29.4s, v23.4s, v0.s[1]\n"
- "ldr q3, [%x[B_ptr], #0x0]\n"
- "fmla v30.4s, v1.4s, v0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q4, [%x[B_ptr], #0x10]\n"
- "fmla v31.4s, v2.4s, v0.s[1]\n"
- "ldr q5, [%x[B_ptr], #0x20]\n"
- "ldr q6, [%x[B_ptr], #0x30]\n"
- "fmla v24.4s, v3.4s, v0.s[2]\n"
- "ldr q7, [%x[B_ptr], #0x40]\n"
- "ldr q8, [%x[B_ptr], #0x50]\n"
- "fmla v25.4s, v4.4s, v0.s[2]\n"
- "ldr q9, [%x[B_ptr], #0x60]\n"
- "fmla v26.4s, v5.4s, v0.s[2]\n"
- "ldr q10, [%x[B_ptr], #0x70]\n"
- "fmla v27.4s, v6.4s, v0.s[2]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v7.4s, v0.s[2]\n"
- "ldr q11, [%x[B_ptr], #0x0]\n"
- "fmla v29.4s, v8.4s, v0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ldr q12, [%x[B_ptr], #0x10]\n"
- "fmla v30.4s, v9.4s, v0.s[2]\n"
- "ldr q13, [%x[B_ptr], #0x20]\n"
- "ldr q14, [%x[B_ptr], #0x30]\n"
- "fmla v31.4s, v10.4s, v0.s[2]\n"
- "ldr q15, [%x[B_ptr], #0x40]\n"
- "fmla v24.4s, v11.4s, v0.s[3]\n"
- "ldr q16, [%x[B_ptr], #0x50]\n"
- "ldr q17, [%x[B_ptr], #0x60]\n"
- "fmla v25.4s, v12.4s, v0.s[3]\n"
- "ldr q18, [%x[B_ptr], #0x70]\n"
- "fmla v26.4s, v13.4s, v0.s[3]\n"
- "fmla v27.4s, v14.4s, v0.s[3]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla v28.4s, v15.4s, v0.s[3]\n"
- "add x19, x19, #0x10\n"
- "fmla v29.4s, v16.4s, v0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla v30.4s, v17.4s, v0.s[3]\n"
- "prfm pldl1keep, [x19, #0x80]\n"
- "fmla v31.4s, v18.4s, v0.s[3]\n"
- "90:" // Width 8: Multiply loop: Main loop skip
- "cbz x20, 92f\n"
- "91:" // Width 8: Multiply loop: Odd block loop
- "ldr s0, [x19], #0x4\n"
- "ldr q19, [%x[B_ptr], #0x0]\n"
- "fmla v24.4s, v19.4s, v0.s[0]\n"
- "ldr q20, [%x[B_ptr], #0x10]\n"
- "ldr q21, [%x[B_ptr], #0x20]\n"
- "fmla v25.4s, v20.4s, v0.s[0]\n"
- "ldr q22, [%x[B_ptr], #0x30]\n"
- "fmla v26.4s, v21.4s, v0.s[0]\n"
- "ldr q23, [%x[B_ptr], #0x40]\n"
- "ldr q1, [%x[B_ptr], #0x50]\n"
- "fmla v27.4s, v22.4s, v0.s[0]\n"
- "ldr q2, [%x[B_ptr], #0x60]\n"
- "ldr q3, [%x[B_ptr], #0x70]\n"
- "fmla v28.4s, v23.4s, v0.s[0]\n"
- "fmla v29.4s, v1.4s, v0.s[0]\n"
- "add %x[B_ptr], %x[B_ptr], #0x80\n"
- "sub x20, x20, #0x1\n"
- "fmla v30.4s, v2.4s, v0.s[0]\n"
- "fmla v31.4s, v3.4s, v0.s[0]\n"
- "cbnz x20, 91b\n"
- "92:" // Width 8: Multiply loop: No odd multiplies
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 93f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1r { v17.4s }, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1r { v16.4s }, [x19]\n"
- "fmin v24.4s, v24.4s, v16.4s\n"
- "fmin v25.4s, v25.4s, v16.4s\n"
- "fmin v26.4s, v26.4s, v16.4s\n"
- "fmin v27.4s, v27.4s, v16.4s\n"
- "fmax v24.4s, v24.4s, v17.4s\n"
- "fmax v25.4s, v25.4s, v17.4s\n"
- "fmax v26.4s, v26.4s, v17.4s\n"
- "fmax v27.4s, v27.4s, v17.4s\n"
- "fmin v28.4s, v28.4s, v16.4s\n"
- "fmin v29.4s, v29.4s, v16.4s\n"
- "fmin v30.4s, v30.4s, v16.4s\n"
- "fmax v28.4s, v28.4s, v17.4s\n"
- "fmax v29.4s, v29.4s, v17.4s\n"
- "fmax v30.4s, v30.4s, v17.4s\n"
- "fmin v31.4s, v31.4s, v16.4s\n"
- "fmax v31.4s, v31.4s, v17.4s\n"
- "93:" // Width 8: No activation
- "str q24, [%x[output_ptr], #0x0]\n"
- "str q25, [%x[output_ptr], #0x10]\n"
- "str q26, [%x[output_ptr], #0x20]\n"
- "str q27, [%x[output_ptr], #0x30]\n"
- "str q28, [%x[output_ptr], #0x40]\n"
- "str q29, [%x[output_ptr], #0x50]\n"
- "str q30, [%x[output_ptr], #0x60]\n"
- "cmp %x[N], #0x20\n"
- "add %x[output_ptr], %x[output_ptr], #0x70\n"
- "blt 94f\n"
- "str q31, [%x[output_ptr], #0x0]\n"
- "add %x[output_ptr], %x[output_ptr], #0x10\n"
- "b 96f\n"
- "94:" // Width 8: Partial writeback
- "tbz %x[N], #1, 95f\n"
- "str d31, [%x[output_ptr]], #0x8\n"
- "tbz %x[N], #0, 96f\n"
- "st1 { v31.s }[2], [%x[output_ptr]]\n"
- "b 96f\n"
- "95:" // Width 8: Partial direct writeback: partial_1_28
- "tbz %x[N], #0, 96f\n"
- "str s31, [%x[output_ptr], #0x0]\n"
- "96:" // Width 8: Writeback done
- "subs x22, x22, #0x8\n"
- "sub %x[N], %x[N], #0x20\n"
- "bgt 1b\n"
- "97:" // Exit
-
- : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
- : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
- );
-}
-
-} // namespace arm_gemm
-
-#endif
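
The file removed above is the standalone fp32 GEMV kernel. Every width block (1 to 8 NEON vectors, i.e. 4 to 32 output columns) follows the same shape: load bias or zero the accumulators, run a main K loop unrolled by four that broadcasts one A element per fmla against a 32-float row of the pretransposed B panel (the "add %x[B_ptr], %x[B_ptr], #0x80" bumps), mop up leftover K values one at a time, optionally clamp with fmin/fmax, then store either full vectors or a tbz-driven partial tail. A minimal intrinsics sketch of the width-4 inner loop, with hypothetical names, no unrolling, and none of the asm's bias, clamp, prefetch or scheduling work:

    #include <arm_neon.h>

    // Sketch only: the real kernel is the hand-scheduled asm above.
    static void gemv_width4_block(const float *A, const float *B, float *out, size_t K)
    {
        float32x4_t acc0 = vdupq_n_f32(0.f), acc1 = vdupq_n_f32(0.f);
        float32x4_t acc2 = vdupq_n_f32(0.f), acc3 = vdupq_n_f32(0.f);
        for (size_t k = 0; k < K; k++) {
            float32x4_t a = vdupq_n_f32(A[k]);             // the v0.s[lane] broadcast
            acc0 = vfmaq_f32(acc0, vld1q_f32(B + 0),  a);  // fmla v24.4s, vN.4s, v0.s[i]
            acc1 = vfmaq_f32(acc1, vld1q_f32(B + 4),  a);  // fmla v25.4s, ...
            acc2 = vfmaq_f32(acc2, vld1q_f32(B + 8),  a);  // fmla v26.4s, ...
            acc3 = vfmaq_f32(acc3, vld1q_f32(B + 12), a);  // fmla v27.4s, ...
            B += 32;                                       // panel row is 32 floats (0x80 bytes)
        }
        vst1q_f32(out + 0,  acc0);
        vst1q_f32(out + 4,  acc1);
        vst1q_f32(out + 8,  acc2);
        vst1q_f32(out + 12, acc3);
    }

Broadcasting A and streaming B keeps the four accumulators independent, which is presumably what lets the hand-written version interleave loads, prefetches and fmlas so freely.
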
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
index cccedc6b9c..586d6a64a4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,6 +62,7 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 24> transforms = {};
+ template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
index 29cdd33893..e5728beba8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
@@ -71,10 +71,6 @@ void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp
register float16x8_t b2 asm("v6");
__asm __volatile (
- // Enable FP16 instruction support (but only if it's not already on).
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
// Initialize result registers, load initial operands, prime prefetches.
"movi v8.8h, #0x0\n"
"ldr %d[a0], [%[a_ptr]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
index c9c48dd1c0..23b87fa192 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
@@ -66,10 +66,6 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
register float16x8_t b2a asm("v7");
__asm __volatile (
- // Enable FP16 instruction support (but only if it's not already on).
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
// Initialize result registers, load initial operands, prime prefetches.
"movi v8.8h, #0x0\n"
"ldr %q[a0], [%[a_ptr]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
index a6d2405e7e..b47fa6a2d7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
@@ -63,10 +63,6 @@ void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16
register float16x8_t b2 asm("v4");
__asm __volatile (
- // Enable FP16 instruction support (but only if it's not already on).
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
// Initialize result registers, load initial operands, prime prefetches.
"movi v8.8h, #0x0\n"
"ldr %q[a0], [%[a_ptr]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
index fca96f6028..3b8770e153 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -22,10 +22,11 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -43,7 +44,8 @@ void a64_hybrid_bf16fp32_dot_6x16( ARGLIST );
class cls_a64_hybrid_bf16fp32_dot_6x16
{
public:
- typedef bfloat16 operand_type;
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -69,7 +71,23 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 2> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 2> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.83 };
+ case CPUModel::A510:
+ return { 7.28 };
+ case CPUModel::V1:
+ return { 27.34 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=a64_hybrid_bf16fp32_dot_6x16;
@@ -81,4 +99,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
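
The templated get_performance_parameters added here is what the improved selection heuristics consume: for bfloat16 operands it reports a per-CPU throughput figure, and a neutral { 1.0 } for anything else. A sketch of a consumer, assuming the brace-initialised figure is a kernel_macs_cycle field of PerformanceParameters (the field name is not shown in this diff, and the real selection logic lives elsewhere):

    // Sketch only, not the library's actual selector.
    template <typename Strategy>
    static float bf16_score(const CPUInfo *ci)
    {
        PerformanceParameters pp = Strategy::template get_performance_parameters<bfloat16>(ci);
        return pp.kernel_macs_cycle; // e.g. 27.34 for this dot kernel on V1
    }

Ranking candidates by such a score is how, on a V1 core, the bfmmla kernel added below (40.09) would be preferred over this dot-product kernel (27.34).
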
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
index afb06dedea..27e08135b6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -1988,8 +1988,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v26.4s }, [x21], #0x10\n"
"tbz x11, #1, 144f\n"
- "mov x19, #0x38\n"
"ldr d11, [x28], #0x8\n"
+ "mov x19, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"ldr d23, [x22], #0x8\n"
@@ -2042,8 +2042,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
"tbz x11, #1, 148f\n"
- "mov x19, #0x18\n"
"ldr d9, [x28], #0x8\n"
+ "mov x19, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
@@ -2717,12 +2717,12 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
"ld1 { v9.4s }, [x28], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
"ld1 { v29.4s }, [x20], #0x10\n"
"tbz x11, #2, 180f\n"
"ld1 { v10.4s }, [x28], #0x10\n"
@@ -2732,8 +2732,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v30.4s }, [x20], #0x10\n"
"tbz x11, #1, 179f\n"
- "mov x19, #0x38\n"
"ldr d11, [x28], #0x8\n"
+ "mov x19, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"ldr d23, [x22], #0x8\n"
@@ -2793,8 +2793,8 @@ void a64_hybrid_bf16fp32_dot_6x16 (
"ld1 { v24.4s }, [x21], #0x10\n"
"ld1 { v28.4s }, [x20], #0x10\n"
"tbz x11, #1, 183f\n"
- "mov x19, #0x18\n"
"ldr d9, [x28], #0x8\n"
+ "mov x19, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
index b53172509e..8cb743b777 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,73 +10,94 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
-
-#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
namespace arm_gemm
{
-
// Actual kernel implementations
-void a64_gemv_fp32_mla_32(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
+void a64_hybrid_bf16fp32_mmla_6x16( ARGLIST );
-class cls_a64_gemv_fp32_mla_32
+class cls_a64_hybrid_bf16fp32_mmla_6x16
{
public:
- typedef float operand_type;
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
- static unsigned int out_width()
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
{
- return 32;
+ return 6;
}
- static constexpr unsigned int k_unroll()
+ static unsigned int out_width()
{
- return 1;
+ return 16;
}
- static constexpr bool supports_accumulate()
+ static constexpr unsigned int k_unroll()
{
- return false;
+ return 4;
}
- static constexpr bool supports_bias()
+ static constexpr bool supports_accumulate()
{
return true;
}
- static constexpr bool supports_activation()
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- return true;
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 25.04 };
+ case CPUModel::A510:
+ return { 7.27 };
+ case CPUModel::V1:
+ return { 40.09 };
+ }
+ }
+
+ return { 1.0 };
}
- StdTransformsFixed<operand_type, result_type, 1, 32, 1> transforms = {};
-
// Default to the generic kernel
- kern_type kernel=a64_gemv_fp32_mla_32;
-
- cls_a64_gemv_fp32_mla_32(const CPUInfo *)
+ kern_type kernel=a64_hybrid_bf16fp32_mmla_6x16;
+ cls_a64_hybrid_bf16fp32_mmla_6x16(const CPUInfo *)
{
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // __aarch64__
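
This rename replaces the old fp32 GEMV class with the new bf16 hybrid mmla kernel. k_unroll() becomes 4 because one BFMMLA instruction consumes a 2x4 bf16 tile from each source register and accumulates into a 2x2 fp32 tile; that tiling is also why the generic.cpp that follows zips bias and accumulator registers into even/odd row pairs on entry (zip1/zip2) and un-interleaves them with uzp1 before writeback. A scalar reference for a single BFMMLA, as a sketch (assuming arm_gemm's bfloat16 converts to float):

    // Both 128-bit sources hold row-major 2x4 bf16 matrices; the destination
    // accumulates the 2x2 fp32 product A * B^T (rows of b act as columns).
    static void bfmmla_ref(float c[2][2], const bfloat16 a[2][4], const bfloat16 b[2][4])
    {
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++)
                for (int k = 0; k < 4; k++)
                    c[i][j] += static_cast<float>(a[i][k]) * static_cast<float>(b[j][k]);
    }

This is what makes the fast-mode fp32 path viable: inputs converted to bf16 give up some mantissa precision in exchange for far more multiply-accumulates per instruction than fp32 fmla.
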
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..0fa358e848
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp
@@ -0,0 +1,3725 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_bf16fp32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 186f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 149f\n"
+ "beq 112f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 75f\n"
+ "beq 38f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x9, 3f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "add x9, x9, #0x40\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 15f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x11, #0x10\n"
+ "bge 12f\n"
+ "tbz x11, #3, 7f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "tbz x11, #2, 5f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 4f\n"
+ "mov x19, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 6f\n"
+ "ldr d11, [x28], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 9f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 8f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x19, #0x18\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 10f\n"
+ "ldr d9, [x28], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 13f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "13:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 18f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x26, #0x8\n"
+ "blt 21f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "ldr q1, [x25, #0x0]\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x10, #0x0]\n"
+ "sub x26, x26, #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 26f\n"
+ "cmp x26, #0x4\n"
+ "blt 23f\n"
+ "22:" // Height 1: Multiply loop: Odd block loop
+ "movi v2.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x4\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x26, #0x4\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "bge 22b\n"
+ "cbz x26, 26f\n"
+ "23:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 24f\n"
+ "ldr s1, [x25], #0x4\n"
+ "tbz x26, #0, 25f\n"
+ "ld1 { v1.h }[2], [x25]\n"
+ "b 25f\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x25, #0x0]\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: Done
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "26:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 16b\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "27:" // Height 1: No activation
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "bge 36f\n"
+ "tbz x11, #3, 31f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "tbz x11, #2, 29f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 28f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "b 35f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 35f\n"
+ "str s11, [x28, #0x0]\n"
+ "b 35f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 30f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "b 35f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 35f\n"
+ "str s10, [x28, #0x0]\n"
+ "b 35f\n"
+ "31:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 33f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 32f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "b 35f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 35f\n"
+ "str s9, [x28, #0x0]\n"
+ "b 35f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 34f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x11, #0, 35f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "b 35f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "35:" // Height 1: Partial direct writeback: Done
+ "b 37f\n"
+ "36:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "37:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 224f\n"
+ "38:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "39:" // Height 2: Column loop
+ "cbz x9, 40f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "add x9, x9, #0x40\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 52f\n"
+ "40:" // Height 2: no bias
+ "tbz %x[flags], #0, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "bge 49f\n"
+ "tbz x11, #3, 44f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 42f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 41f\n"
+ "mov x19, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "tbz x11, #0, 48f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "b 48f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 48f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "b 48f\n"
+ "42:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 43f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x11, #0, 48f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "b 48f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 48f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "b 48f\n"
+ "44:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 46f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 45f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "tbz x11, #0, 48f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "b 48f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 48f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "b 48f\n"
+ "46:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 47f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x11, #0, 48f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "b 48f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "48:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 50f\n"
+ "49:" // Height 2: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "50:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 52f\n"
+ "51:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "52:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "53:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 54f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 55f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 55f\n"
+ "54:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "55:" // Height 2: input setup done
+ "cmp x26, #0x8\n"
+ "blt 58f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ "sub x26, x26, #0x8\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "ldr q1, [x25, #0x0]\n"
+ "bge 56b\n"
+ "57:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "sub x26, x26, #0x8\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "58:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 63f\n"
+ "cmp x26, #0x4\n"
+ "blt 60f\n"
+ "59:" // Height 2: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x4\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x26, #0x4\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "bge 59b\n"
+ "cbz x26, 63f\n"
+ "60:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 61f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "tbz x26, #0, 62f\n"
+ "ld1 { v1.h }[2], [x25]\n"
+ "ld1 { v2.h }[2], [x24]\n"
+ "b 62f\n"
+ "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
+ "62:" // Height 2: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ "63:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 53b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "64:" // Height 2: No activation
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "bge 73f\n"
+ "tbz x11, #3, 68f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 66f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 65f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "b 72f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 72f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "b 72f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 67f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "b 72f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 72f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "b 72f\n"
+ "68:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 70f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 69f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "b 72f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 72f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "b 72f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 71f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "tbz x11, #0, 72f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "b 72f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "72:" // Height 2: Partial direct writeback: Done
+ "b 74f\n"
+ "73:" // Height 2: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "74:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 39b\n"
+ "b 224f\n"
+ "75:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "76:" // Height 3: Column loop
+ "cbz x9, 77f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "mov v20.16b, v12.16b\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 89f\n"
+ "77:" // Height 3: no bias
+ "tbz %x[flags], #0, 88f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "bge 86f\n"
+ "tbz x11, #3, 81f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 79f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 78f\n"
+ "mov x19, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "tbz x11, #0, 85f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 85f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 85f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "b 85f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 80f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #0, 85f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 85f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 85f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 85f\n"
+ "81:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 83f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 82f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #0, 85f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 85f\n"
+ "82:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 85f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 85f\n"
+ "83:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 84f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #0, 85f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 85f\n"
+ "84:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "85:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 87f\n"
+ "86:" // Height 3: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "87:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 89f\n"
+ "88:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "89:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "90:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 91f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 92f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 92f\n"
+ "91:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "92:" // Height 3: input setup done
+ "cmp x26, #0x8\n"
+ "blt 95f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "blt 94f\n"
+ "93:" // Height 3: Multiply loop: Main loop head
+ "movi v4.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "sub x26, x26, #0x8\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ "bge 93b\n"
+ "94:" // Height 3: Multiply loop: Single iteration only
+ "movi v4.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "sub x26, x26, #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ "95:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 100f\n"
+ "cmp x26, #0x4\n"
+ "blt 97f\n"
+ "96:" // Height 3: Multiply loop: Odd block loop
+ "movi v4.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x4\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x4\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "bge 96b\n"
+ "cbz x26, 100f\n"
+ "97:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 98f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "tbz x26, #0, 99f\n"
+ "ld1 { v1.h }[2], [x25]\n"
+ "ld1 { v2.h }[2], [x24]\n"
+ "ld1 { v3.h }[2], [x23]\n"
+ "b 99f\n"
+ "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
+ "ldr h3, [x23, #0x0]\n"
+ "99:" // Height 3: Multiply loop: Ragged operand read: Done
+ "movi v4.16b, #0x0\n"
+ "ldr q7, [x10, #0x0]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "100:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 90b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 101f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "101:" // Height 3: No activation
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "bge 110f\n"
+ "tbz x11, #3, 105f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 103f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 102f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 109f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 109f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 109f\n"
+ "103:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 104f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 109f\n"
+ "104:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 109f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 109f\n"
+ "105:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 107f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 106f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 109f\n"
+ "106:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 109f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 109f\n"
+ "107:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 108f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 109f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 109f\n"
+ "108:" // Height 3: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "109:" // Height 3: Partial direct writeback: Done
+ "b 111f\n"
+ "110:" // Height 3: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "111:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 76b\n"
+ "b 224f\n"
+ "112:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "113:" // Height 4: Column loop
+ "cbz x9, 114f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "mov v20.16b, v12.16b\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 126f\n"
+ "114:" // Height 4: no bias
+ "tbz %x[flags], #0, 125f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 123f\n"
+ "tbz x11, #3, 118f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 116f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 115f\n"
+ "mov x19, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #0, 122f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 122f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 122f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 122f\n"
+ "116:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 117f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #0, 122f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 122f\n"
+ "117:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 122f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 122f\n"
+ "118:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 120f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 119f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #0, 122f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 122f\n"
+ "119:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 122f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 122f\n"
+ "120:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 121f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #0, 122f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 122f\n"
+ "121:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "122:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 124f\n"
+ "123:" // Height 4: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "124:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 126f\n"
+ "125:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "126:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "127:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 128f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 129f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 129f\n"
+ "128:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "129:" // Height 4: input setup done
+ "cmp x26, #0x8\n"
+ "blt 132f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "blt 131f\n"
+ "130:" // Height 4: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x8\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ "bge 130b\n"
+ "131:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "sub x26, x26, #0x8\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ "132:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 137f\n"
+ "cmp x26, #0x4\n"
+ "blt 134f\n"
+ "133:" // Height 4: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x4\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x4\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "bge 133b\n"
+ "cbz x26, 137f\n"
+ "134:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 135f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "tbz x26, #0, 136f\n"
+ "ld1 { v1.h }[2], [x25]\n"
+ "ld1 { v2.h }[2], [x24]\n"
+ "ld1 { v3.h }[2], [x23]\n"
+ "ld1 { v4.h }[2], [x22]\n"
+ "b 136f\n"
+ "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
+ "ldr h3, [x23, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
+ "136:" // Height 4: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "137:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 127b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 138f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "138:" // Height 4: No activation
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "bge 147f\n"
+ "tbz x11, #3, 142f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 140f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 139f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 146f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 146f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 141f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 146f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 146f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 144f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 143f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 146f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 146f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 145f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x11, #0, 146f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 146f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "146:" // Height 4: Partial direct writeback: Done
+ "b 148f\n"
+ "147:" // Height 4: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "148:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 113b\n"
+ "b 224f\n"
+ "149:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "150:" // Height 5: Column loop
+ "cbz x9, 151f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 163f\n"
+ "151:" // Height 5: no bias
+ "tbz %x[flags], #0, 162f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 160f\n"
+ "tbz x11, #3, 155f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 153f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 152f\n"
+ "ldr d16, [x28], #0x8\n"
+ "mov x19, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d6, [x21], #0x8\n"
+ "tbz x11, #0, 159f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v6.s }[2], [x21]\n"
+ "b 159f\n"
+ "152:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 159f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s6, [x21, #0x0]\n"
+ "b 159f\n"
+ "153:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 154f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #0, 159f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 159f\n"
+ "154:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 159f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 159f\n"
+ "155:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 157f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 156f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #0, 159f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 159f\n"
+ "156:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 159f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "b 159f\n"
+ "157:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 158f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #0, 159f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "b 159f\n"
+ "158:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "159:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 161f\n"
+ "160:" // Height 5: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q25, [x21, #0x0]\n"
+ "ldr q26, [x21, #0x10]\n"
+ "ldr q27, [x21, #0x20]\n"
+ "ldr q6, [x21, #0x30]\n"
+ "161:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 163f\n"
+ "162:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "163:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "164:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 166f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 166f\n"
+ "165:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "166:" // Height 5: input setup done
+ "cmp x26, #0x8\n"
+ "blt 169f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "blt 168f\n"
+ "167:" // Height 5: Multiply loop: Main loop head
+ "movi v6.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x8\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ "bge 167b\n"
+ "168:" // Height 5: Multiply loop: Single iteration only
+ "movi v6.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "sub x26, x26, #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ "169:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 174f\n"
+ "cmp x26, #0x4\n"
+ "blt 171f\n"
+ "170:" // Height 5: Multiply loop: Odd block loop
+ "movi v7.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x4\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x4\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d5, [x21], #0x8\n"
+ "ldr q6, [x10, #0x0]\n"
+ "trn1 v4.2d, v5.2d, v7.2d\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "bge 170b\n"
+ "cbz x26, 174f\n"
+ "171:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 172f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x21], #0x4\n"
+ "tbz x26, #0, 173f\n"
+ "ld1 { v1.h }[2], [x25]\n"
+ "ld1 { v2.h }[2], [x24]\n"
+ "ld1 { v3.h }[2], [x23]\n"
+ "ld1 { v4.h }[2], [x22]\n"
+ "ld1 { v5.h }[2], [x21]\n"
+ "b 173f\n"
+ "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
+ "ldr h3, [x23, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
+ "ldr h5, [x21, #0x0]\n"
+ "173:" // Height 5: Multiply loop: Ragged operand read: Done
+ "movi v6.16b, #0x0\n"
+ "ldr q7, [x10, #0x0]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "174:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 164b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 175f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "175:" // Height 5: No activation
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "bge 184f\n"
+ "tbz x11, #3, 179f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 177f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 176f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 183f\n"
+ "176:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 183f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 183f\n"
+ "177:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 178f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 183f\n"
+ "178:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 183f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 183f\n"
+ "179:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 181f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 180f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 183f\n"
+ "180:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 183f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 183f\n"
+ "181:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 182f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 183f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 183f\n"
+ "182:" // Height 5: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "183:" // Height 5: Partial direct writeback: Done
+ "b 185f\n"
+ "184:" // Height 5: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "185:" // Height 5: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 150b\n"
+ "b 224f\n"
+ "186:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "187:" // Height 6: Column loop
+ "cbz x9, 188f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 200f\n"
+ "188:" // Height 6: no bias
+ "tbz %x[flags], #0, 199f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 197f\n"
+ "tbz x11, #3, 192f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 190f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 189f\n"
+ "ldr d16, [x28], #0x8\n"
+ "mov x19, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d6, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v6.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 196f\n"
+ "189:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 196f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s6, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "b 196f\n"
+ "190:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 191f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
+ "b 196f\n"
+ "191:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 196f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
+ "b 196f\n"
+ "192:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 194f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 193f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
+ "b 196f\n"
+ "193:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 196f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
+ "b 196f\n"
+ "194:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 195f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #0, 196f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 196f\n"
+ "195:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
+ "196:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 198f\n"
+ "197:" // Height 6: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q25, [x21, #0x0]\n"
+ "ldr q26, [x21, #0x10]\n"
+ "ldr q27, [x21, #0x20]\n"
+ "ldr q6, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
+ "198:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 200f\n"
+ "199:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "200:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "201:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 202f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 203f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 203f\n"
+ "202:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "203:" // Height 6: input setup done
+ "cmp x26, #0x8\n"
+ "blt 206f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x10\n"
+ "blt 205f\n"
+ "204:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x20, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x8\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ "bge 204b\n"
+ "205:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "sub x26, x26, #0x8\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x20, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x80]\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x90]\n"
+ ".inst 0x6e47ec28 // bfmmla v8.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec70 // bfmmla v16.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb8 // bfmmla v24.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xa0]\n"
+ ".inst 0x6e46ec2c // bfmmla v12.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec74 // bfmmla v20.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbc // bfmmla v28.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xb0]\n"
+ ".inst 0x6e47ec29 // bfmmla v9.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec71 // bfmmla v17.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecb9 // bfmmla v25.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xc0]\n"
+ ".inst 0x6e46ec2d // bfmmla v13.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec75 // bfmmla v21.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbd // bfmmla v29.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xd0]\n"
+ ".inst 0x6e47ec2a // bfmmla v10.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec72 // bfmmla v18.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecba // bfmmla v26.4s, v5.8h, v7.8h\n"
+ "ldr q7, [x10, #0xe0]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec76 // bfmmla v22.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbe // bfmmla v30.4s, v5.8h, v6.8h\n"
+ "ldr q6, [x10, #0xf0]\n"
+ "add x10, x10, #0x100\n"
+ ".inst 0x6e47ec2b // bfmmla v11.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e47ec73 // bfmmla v19.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e47ecbb // bfmmla v27.4s, v5.8h, v7.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec77 // bfmmla v23.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e46ecbf // bfmmla v31.4s, v5.8h, v6.8h\n"
+ "206:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 211f\n"
+ "cmp x26, #0x4\n"
+ "blt 208f\n"
+ "207:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x4\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x4\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d5, [x21], #0x8\n"
+ "ldr d7, [x20], #0x8\n"
+ "trn1 v4.2d, v5.2d, v7.2d\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "bge 207b\n"
+ "cbz x26, 211f\n"
+ "208:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x26, #1, 209f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x21], #0x4\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz x26, #0, 210f\n"
+ "ld1 { v1.h }[2], [x25]\n"
+ "ld1 { v2.h }[2], [x24]\n"
+ "ld1 { v3.h }[2], [x23]\n"
+ "ld1 { v4.h }[2], [x22]\n"
+ "ld1 { v5.h }[2], [x21]\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 210f\n"
+ "209:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h1, [x25, #0x0]\n"
+ "ldr h2, [x24, #0x0]\n"
+ "ldr h3, [x23, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
+ "ldr h5, [x21, #0x0]\n"
+ "ldr h6, [x20, #0x0]\n"
+ "210:" // Height 6: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x10, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x10, #0x10]\n"
+ ".inst 0x6e47ec08 // bfmmla v8.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec50 // bfmmla v16.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec98 // bfmmla v24.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x20]\n"
+ ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9c // bfmmla v28.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x30]\n"
+ ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec51 // bfmmla v17.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec99 // bfmmla v25.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x40]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9d // bfmmla v29.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x50]\n"
+ ".inst 0x6e47ec0a // bfmmla v10.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec52 // bfmmla v18.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9a // bfmmla v26.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x60]\n"
+ ".inst 0x6e46ec0e // bfmmla v14.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9e // bfmmla v30.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec53 // bfmmla v19.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9b // bfmmla v27.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec0f // bfmmla v15.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9f // bfmmla v31.4s, v4.8h, v6.8h\n"
+ "211:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 201b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbz %x[flags], #1, 212f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "212:" // Height 6: No activation
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "cmp x11, #0x10\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "bge 221f\n"
+ "tbz x11, #3, 216f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 214f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 213f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 220f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 220f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 220f\n"
+ "214:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 215f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 220f\n"
+ "215:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 220f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 218f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 217f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 220f\n"
+ "217:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 220f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 220f\n"
+ "218:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 219f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x11, #0, 220f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 220f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "220:" // Height 6: Partial direct writeback: Done
+ "b 222f\n"
+ "221:" // Height 6: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q23, [x21, #0x0]\n"
+ "str q28, [x21, #0x10]\n"
+ "str q29, [x21, #0x20]\n"
+ "str q30, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "222:" // Height 6: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 187b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 224f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 223f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "223:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "224:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
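Note on the generated kernel above: the "MMLA fixup" blocks (zip1/zip2 after accumulator load) and the uzp1/uzp2 sequences before writeback follow from the BFMMLA instruction itself, which reads each 128-bit source as a 2x4 bf16 tile and accumulates a 2x2 fp32 tile, so every accumulator register interleaves two output rows. A minimal C++ sketch of the instruction's reference semantics (illustrative names only, not library code):

    #include <cstdint>
    #include <cstring>

    // bf16 is the top 16 bits of the fp32 with the same value.
    static float bf16_to_f32(uint16_t h) {
        uint32_t bits = uint32_t(h) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // BFMMLA Vd.4S, Vn.8H, Vm.8H: acc (2x2 fp32, row-major) +=
    // a (2x4 bf16, row-major) x b^T (b is also stored 2x4, row-major).
    void bfmmla_ref(float acc[4], const uint16_t a[8], const uint16_t b[8]) {
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 2; ++j)
                for (int k = 0; k < 4; ++k)
                    acc[2 * i + j] += bf16_to_f32(a[4 * i + k]) * bf16_to_f32(b[4 * j + k]);
    }

This is also why the trn1/trn2 pairs in the multiply loops pack two input rows into one register (a zeroed register pads the odd fifth row at height 5), and why heights 5 and 6 share the same 24-accumulator layout.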
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
index 674d71d626..4dd7556acd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_fp16_mla_6x32_a55( ARGLIST );
class cls_a64_hybrid_fp16_mla_6x32
{
public:
- typedef __fp16 operand_type;
+ typedef __fp16 lhs_operand_type;
+ typedef __fp16 rhs_operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 32, 1> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 32, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 6.94 };
- default:
- return { 14.53 };
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 5.22 };
+ case CPUModel::A510:
+ return { 8.94 };
+ case CPUModel::V1:
+ return { 29.26 };
+ default:
+ return { 14.53 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
@@ -99,4 +108,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
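The change above makes the performance estimate type-aware: get_performance_parameters is now a template over the accumulation type, so the improved kernel-selection heuristics can score this fp16 kernel against bf16 fast-mode alternatives on the same MACs-per-cycle scale, with non-__fp16 instantiations falling back to the conservative { 1.0 }. A hedged usage sketch (the CPUInfo pointer comes from the surrounding selection code, which is not part of this diff):

    // Illustrative only: rank this kernel for native __fp16 accumulation.
    PerformanceParameters p =
        cls_a64_hybrid_fp16_mla_6x32::get_performance_parameters<__fp16>(ci);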
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
index 87c73740e7..9157d29eba 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp
@@ -92,9 +92,6 @@ void a64_hybrid_fp16_mla_6x32_a55 (
break;
}
__asm__ __volatile__(
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 246f\n"
@@ -1305,14 +1302,14 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v14.16b, v10.16b\n"
- "mov v18.16b, v10.16b\n"
"ldr q11, [x16, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
"add x16, x16, #0x40\n"
+ "mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 120f\n"
"101:" // Height 3: no bias
@@ -2158,18 +2155,18 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q11, [x16, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "add x16, x16, #0x40\n"
"mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
"mov v23.16b, v11.16b\n"
"b 169f\n"
"150:" // Height 4: no bias
@@ -3182,22 +3179,22 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q11, [x16, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "add x16, x16, #0x40\n"
"mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
"mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
- "mov v23.16b, v11.16b\n"
"mov v27.16b, v11.16b\n"
"b 218f\n"
"199:" // Height 5: no bias
@@ -4380,26 +4377,26 @@ void a64_hybrid_fp16_mla_6x32_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q11, [x16, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "add x16, x16, #0x40\n"
"mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
"mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
"mov v28.16b, v8.16b\n"
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
- "mov v23.16b, v11.16b\n"
- "mov v27.16b, v11.16b\n"
"mov v31.16b, v11.16b\n"
"b 267f\n"
"248:" // Height 6: no bias
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
index 6e51773166..8877306f40 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -92,9 +92,6 @@ void a64_hybrid_fp16_mla_6x32 (
break;
}
__asm__ __volatile__(
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ".arch armv8.2-a+fp16\n"
-#endif
"1:" // Row loop
"cmp %x[M], #0x6\n"
"bge 246f\n"
@@ -4068,12 +4065,12 @@ void a64_hybrid_fp16_mla_6x32 (
"ld1 { v16.8h }, [x23], #0x10\n"
"ld1 { v20.8h }, [x22], #0x10\n"
"ld1 { v24.8h }, [x21], #0x10\n"
- "ld1 { v28.8h }, [x20], #0x10\n"
"ld1 { v9.8h }, [x28], #0x10\n"
"ld1 { v13.8h }, [x24], #0x10\n"
"ld1 { v17.8h }, [x23], #0x10\n"
"ld1 { v21.8h }, [x22], #0x10\n"
"ld1 { v25.8h }, [x21], #0x10\n"
+ "ld1 { v28.8h }, [x20], #0x10\n"
"ld1 { v29.8h }, [x20], #0x10\n"
"tbz x11, #3, 252f\n"
"ld1 { v10.8h }, [x28], #0x10\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
new file mode 100644
index 0000000000..d68e4a22b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24.hpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
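+// ARGLIST expands to the kernel signature shared by the implementations below:
+// (num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg, bias, act, accumulate)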
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_fp32_mla_4x24( ARGLIST );
+void a64_hybrid_fp32_mla_4x24_a55( ARGLIST );
+
+class cls_a64_hybrid_fp32_mla_4x24
+{
+public:
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 24;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
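+
+    // Each kernel iteration produces a 4x24 tile of C: four rows of A against
+    // 24 B columns, held as six 4-float accumulator registers per output row.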
+
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 24, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 2.985 };
+ case CPUModel::A53:
+ return { 1.43 };
+ case CPUModel::A73:
+ return { 2.56 };
+ case CPUModel::A510:
+ return { 3.51 };
+ case CPUModel::V1:
+ return { 14.38 };
+ default:
+ return { 6.614 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp32_mla_4x24;
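+
+    // A53 and A55 are in-order cores; the _a55 variant fetches B as 64-bit
+    // halves (ldr d / mov v.d[1]), a pattern that typically schedules better
+    // on those pipelines.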
+ cls_a64_hybrid_fp32_mla_4x24(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A55r1:
+ case CPUModel::A53:
+ kernel=a64_hybrid_fp32_mla_4x24_a55;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
new file mode 100644
index 0000000000..1fbc9232f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp
@@ -0,0 +1,2807 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_4x24_a55 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
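+    // ka is handed to the assembly as [args_ptr]; its fields are addressed
+    // with offsetof() immediates rather than occupying extra registers.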
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
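+
+    // Flag bits tested by the assembly: bit 0 (0x1) accumulate into C,
+    // bit 1 (0x2) apply the min/max clamp, bit 2 (0x4) indirect output,
+    // bit 3 (0x8) indirect (string) input.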
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 124f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 83f\n"
+ "beq 42f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "mov x13, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x14, 3f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "ldr q12, [x14, #0x40]\n"
+ "ldr q13, [x14, #0x50]\n"
+ "add x14, x14, #0x60\n"
+ "b 18f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "cmp x16, #0x18\n"
+ "bge 16f\n"
+ "tbz x16, #4, 7f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 5f\n"
+ "ld1 { v12.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 4f\n"
+ "mov x19, #0x58\n"
+ "ldr d13, [x13], #0x8\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v13.s }[2], [x13]\n"
+ "b 15f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s13, [x13, #0x0]\n"
+ "b 15f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x16, #1, 6f\n"
+ "ldr d12, [x13], #0x8\n"
+ "mov x19, #0x48\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v12.s }[2], [x13]\n"
+ "b 15f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s12, [x13, #0x0]\n"
+ "b 15f\n"
+ "7:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x16, #3, 11f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 9f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 8f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 15f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 15f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x16, #1, 10f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 15f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 15f\n"
+ "11:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x16, #2, 13f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 12f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 15f\n"
+ "12:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 15f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 15f\n"
+ "13:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x16, #1, 14f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 15f\n"
+ "14:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "mov x19, #0x0\n"
+ "15:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 18f\n"
+ "16:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x13, #0x40]\n"
+ "ldr q13, [x13, #0x50]\n"
+ "b 18f\n"
+ "17:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "18:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "19:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 21f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "21:" // Height 1: input setup done
+ "cmp x11, #0x4\n"
+ "blt 24f\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q4, [x15, #0x0]\n"
+ "cmp x11, #0x8\n"
+ "blt 23f\n"
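+      // Main loop: each pass consumes four A values (one q0 load) against the
+      // 24-column B panel; B is fetched as 64-bit halves (ldr d / mov v.d[1]),
+      // which tends to schedule better on the in-order A55.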
+ "22:" // Height 1: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr d5, [x15, #0x10]\n"
+ "ldr x9, [x15, #0x18]\n"
+ "add x10, x10, #0x10\n"
+ "ldr d6, [x15, #0x20]\n"
+ "sub x11, x11, #0x4\n"
+ "ldr x28, [x15, #0x28]\n"
+ "cmp x11, #0x8\n"
+ "mov v5.d[1], x9\n"
+ "ldr d7, [x15, #0x30]\n"
+ "ldr x27, [x15, #0x38]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d4, [x15, #0x40]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "mov v7.d[1], x27\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr d5, [x15, #0x50]\n"
+ "ldr x9, [x15, #0x58]\n"
+ "mov v4.d[1], x26\n"
+ "ldr d6, [x15, #0x60]\n"
+ "ldr x28, [x15, #0x68]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "mov v5.d[1], x9\n"
+ "ldr d7, [x15, #0x70]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x27, [x15, #0x78]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr d4, [x15, #0x80]\n"
+ "ldr x26, [x15, #0x88]\n"
+ "mov v7.d[1], x27\n"
+ "ldr d5, [x15, #0x90]\n"
+ "ldr x9, [x15, #0x98]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "mov v4.d[1], x26\n"
+ "ldr d6, [x15, #0xa0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "mov v5.d[1], x9\n"
+ "ldr x28, [x15, #0xa8]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "ldr d7, [x15, #0xb0]\n"
+ "ldr x27, [x15, #0xb8]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d4, [x15, #0xc0]\n"
+ "ldr x26, [x15, #0xc8]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "mov v7.d[1], x27\n"
+ "ldr d5, [x15, #0xd0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "mov v4.d[1], x26\n"
+ "ldr x9, [x15, #0xd8]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "ldr d6, [x15, #0xe0]\n"
+ "ldr x28, [x15, #0xe8]\n"
+ "mov v5.d[1], x9\n"
+ "ldr d7, [x15, #0xf0]\n"
+ "ldr x27, [x15, #0xf8]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d4, [x15, #0x100]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "mov v7.d[1], x27\n"
+ "ldr x26, [x15, #0x108]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr d5, [x15, #0x110]\n"
+ "ldr x9, [x15, #0x118]\n"
+ "mov v4.d[1], x26\n"
+ "ldr d6, [x15, #0x120]\n"
+ "ldr x28, [x15, #0x128]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "mov v5.d[1], x9\n"
+ "ldr d7, [x15, #0x130]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x27, [x15, #0x138]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr d4, [x15, #0x140]\n"
+ "ldr x26, [x15, #0x148]\n"
+ "mov v7.d[1], x27\n"
+ "ldr d5, [x15, #0x150]\n"
+ "ldr x9, [x15, #0x158]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "mov v4.d[1], x26\n"
+ "ldr d6, [x15, #0x160]\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "mov v5.d[1], x9\n"
+ "ldr x28, [x15, #0x168]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "ldr d7, [x15, #0x170]\n"
+ "ldr x27, [x15, #0x178]\n"
+ "add x15, x15, #0x180\n"
+ "mov v6.d[1], x28\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr x25, [x10, #0x8]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "mov v7.d[1], x27\n"
+ "ldr d4, [x15, #0x0]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr d0, [x10, #0x0]\n"
+ "ldr x26, [x15, #0x8]\n"
+ "mov v0.d[1], x25\n"
+ "mov v4.d[1], x26\n"
+ "bge 22b\n"
+ "23:" // Height 1: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "sub x11, x11, #0x4\n"
+ "ldr q7, [x15, #0x30]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x15, #0x80]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr q5, [x15, #0x90]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "ldr q4, [x15, #0xc0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "ldr q5, [x15, #0xd0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "ldr q4, [x15, #0x100]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr q5, [x15, #0x110]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "ldr q4, [x15, #0x140]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr q5, [x15, #0x150]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
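+      // K tail: any remaining K values (x11 < 4) are handled one at a time below.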
+ "24:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 26f\n"
+ "25:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "sub x11, x11, #0x1\n"
+ "ldr q4, [x15, #0x0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "add x15, x15, #0x60\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "cbnz x11, 25b\n"
+ "26:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 19b\n"
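+      // All strings processed: apply the optional min/max clamp (flag bit 1),
+      // then write the tile back, bit-testing the remaining column count x16
+      // for the partial (N % 24) store paths.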
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "27:" // Height 1: No activation
+ "cmp x16, #0x18\n"
+ "bge 40f\n"
+ "tbz x16, #4, 31f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v11.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 29f\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 28f\n"
+ "str d13, [x13], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "b 39f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 39f\n"
+ "str s13, [x13, #0x0]\n"
+ "b 39f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 30f\n"
+ "str d12, [x13], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "b 39f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 39f\n"
+ "str s12, [x13, #0x0]\n"
+ "b 39f\n"
+ "31:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 35f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 33f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 32f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 39f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 39f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 39f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 34f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 39f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 39f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 39f\n"
+ "35:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 37f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 36f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 39f\n"
+ "36:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 39f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 39f\n"
+ "37:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 38f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x16, #0, 39f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 39f\n"
+ "38:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "39:" // Height 1: Partial direct writeback: Done
+ "b 41f\n"
+ "40:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x13, #0x40]\n"
+ "str q13, [x13, #0x50]\n"
+ "add x13, x13, #0x60\n"
+ "41:" // Height 1: Writeback done
+ "subs x16, x16, #0x18\n"
+ "bgt 2b\n"
+ "b 166f\n"
+ "42:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "43:" // Height 2: Column loop
+ "cbz x14, 44f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q12, [x14, #0x40]\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr q13, [x14, #0x50]\n"
+ "mov v17.16b, v11.16b\n"
+ "add x14, x14, #0x60\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "b 59f\n"
+ "44:" // Height 2: no bias
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x16, #0x18\n"
+ "add x23, x13, x19, LSL #2\n"
+ "bge 57f\n"
+ "tbz x16, #4, 48f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 46f\n"
+ "ld1 { v12.4s }, [x13], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 45f\n"
+ "mov x19, #0x58\n"
+ "ldr d13, [x13], #0x8\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 56f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s13, [x13, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 56f\n"
+ "46:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x16, #1, 47f\n"
+ "ldr d12, [x13], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x19, #0x48\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v12.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 56f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s12, [x13, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 56f\n"
+ "48:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x16, #3, 52f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 50f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 49f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 56f\n"
+ "49:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "b 56f\n"
+ "50:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x16, #1, 51f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "b 56f\n"
+ "51:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "b 56f\n"
+ "52:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x16, #2, 54f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 53f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 56f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 56f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 56f\n"
+ "54:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x16, #1, 55f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 56f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 56f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x23, #0x0]\n"
+ "56:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 59f\n"
+ "57:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x13, #0x40]\n"
+ "ldr q13, [x13, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "b 59f\n"
+ "58:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "59:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "60:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x12, 62f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 62f\n"
+ "61:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x24, x10, x19, LSL #2\n"
+ "62:" // Height 2: input setup done
+ "cmp x11, #0x4\n"
+ "blt 65f\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x11, #0x8\n"
+ "ldr q4, [x15, #0x0]\n"
+ "blt 64f\n"
+ "63:" // Height 2: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr d5, [x15, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr x9, [x15, #0x18]\n"
+ "ldr d6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "ldr x28, [x15, #0x28]\n"
+ "add x24, x24, #0x10\n"
+ "mov v5.d[1], x9\n"
+ "ldr d7, [x15, #0x30]\n"
+ "ldr x27, [x15, #0x38]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr d4, [x15, #0x40]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr d5, [x15, #0x50]\n"
+ "mov v4.d[1], x26\n"
+ "ldr x9, [x15, #0x58]\n"
+ "ldr d6, [x15, #0x60]\n"
+ "cmp x11, #0x8\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "ldr x28, [x15, #0x68]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "mov v5.d[1], x9\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "ldr d7, [x15, #0x70]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr x27, [x15, #0x78]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr d4, [x15, #0x80]\n"
+ "ldr x26, [x15, #0x88]\n"
+ "mov v7.d[1], x27\n"
+ "ldr d5, [x15, #0x90]\n"
+ "ldr x9, [x15, #0x98]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr d6, [x15, #0xa0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "ldr x28, [x15, #0xa8]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "ldr d7, [x15, #0xb0]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "ldr x27, [x15, #0xb8]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d4, [x15, #0xc0]\n"
+ "ldr x26, [x15, #0xc8]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ldr d5, [x15, #0xd0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "ldr x9, [x15, #0xd8]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "ldr d6, [x15, #0xe0]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "ldr x28, [x15, #0xe8]\n"
+ "mov v5.d[1], x9\n"
+ "ldr d7, [x15, #0xf0]\n"
+ "ldr x27, [x15, #0xf8]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "ldr d4, [x15, #0x100]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "ldr x26, [x15, #0x108]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr d5, [x15, #0x110]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "ldr x9, [x15, #0x118]\n"
+ "mov v4.d[1], x26\n"
+ "ldr d6, [x15, #0x120]\n"
+ "ldr x28, [x15, #0x128]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "ldr d7, [x15, #0x130]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "ldr x27, [x15, #0x138]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr d4, [x15, #0x140]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr x26, [x15, #0x148]\n"
+ "mov v7.d[1], x27\n"
+ "ldr d5, [x15, #0x150]\n"
+ "ldr x9, [x15, #0x158]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr d6, [x15, #0x160]\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "ldr x28, [x15, #0x168]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "ldr d7, [x15, #0x170]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "ldr x27, [x15, #0x178]\n"
+ "mov v6.d[1], x28\n"
+ "ldr x25, [x10, #0x8]\n"
+ "ldr x23, [x24, #0x8]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "ldr d4, [x15, #0x0]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr x26, [x15, #0x8]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "ldr d0, [x10, #0x0]\n"
+ "ldr d1, [x24, #0x0]\n"
+ "mov v4.d[1], x26\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x23\n"
+ "bge 63b\n"
+ "64:" // Height 2: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x15, #0x80]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "ldr q5, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "ldr q4, [x15, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "ldr q5, [x15, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "ldr q4, [x15, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "ldr q5, [x15, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "ldr q4, [x15, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "ldr q5, [x15, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "65:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 67f\n"
+ "66:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "sub x11, x11, #0x1\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr q4, [x15, #0x0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x60\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "cbnz x11, 66b\n"
+ "67:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 60b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "add x23, x13, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 68f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "68:" // Height 2: No activation
+ "cmp x16, #0x18\n"
+ "bge 81f\n"
+ "tbz x16, #4, 72f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v11.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 70f\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 69f\n"
+ "str d13, [x13], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 80f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 80f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 80f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 71f\n"
+ "str d12, [x13], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 80f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 80f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 80f\n"
+ "72:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 76f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 74f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 73f\n"
+ "str d11, [x13], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 80f\n"
+ "73:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 80f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 80f\n"
+ "74:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 75f\n"
+ "str d10, [x13], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 80f\n"
+ "75:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 80f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "b 80f\n"
+ "76:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 78f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 77f\n"
+ "str d9, [x13], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "b 80f\n"
+ "77:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 80f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "b 80f\n"
+ "78:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 79f\n"
+ "str d8, [x13], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "b 80f\n"
+ "79:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "80:" // Height 2: Partial direct writeback: Done
+ "b 82f\n"
+ "81:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x13, #0x40]\n"
+ "str q13, [x13, #0x50]\n"
+ "add x13, x13, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "82:" // Height 2: Writeback done
+ "subs x16, x16, #0x18\n"
+ "bgt 43b\n"
+ "b 166f\n"
+ "83:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "84:" // Height 3: Column loop
+ "cbz x14, 85f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q12, [x14, #0x40]\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr q13, [x14, #0x50]\n"
+ "mov v17.16b, v11.16b\n"
+ "add x14, x14, #0x60\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "b 100f\n"
+ "85:" // Height 3: no bias
+ "tbz %x[flags], #0, 99f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x16, #0x18\n"
+ "add x23, x13, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 98f\n"
+ "tbz x16, #4, 89f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "tbz x16, #2, 87f\n"
+ "ld1 { v12.4s }, [x13], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 86f\n"
+ "ldr d13, [x13], #0x8\n"
+ "mov x19, #0x58\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "b 97f\n"
+ "86:" // Height 3: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s13, [x13, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "b 97f\n"
+ "87:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x16, #1, 88f\n"
+ "ldr d12, [x13], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x19, #0x48\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v12.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 97f\n"
+ "88:" // Height 3: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s12, [x13, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "b 97f\n"
+ "89:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x16, #3, 93f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x16, #2, 91f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 90f\n"
+ "ldr d11, [x13], #0x8\n"
+ "mov x19, #0x38\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 97f\n"
+ "90:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 97f\n"
+ "91:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x16, #1, 92f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 97f\n"
+ "92:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 97f\n"
+ "93:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x16, #2, 95f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 94f\n"
+ "ldr d9, [x13], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 97f\n"
+ "94:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 97f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 97f\n"
+ "95:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x16, #1, 96f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x16, #0, 97f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 97f\n"
+ "96:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "97:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 100f\n"
+ "98:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x13, #0x40]\n"
+ "ldr q13, [x13, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
+ "b 100f\n"
+ "99:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "100:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "101:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 102f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x12, 103f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 103f\n"
+ "102:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x24, x10, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "103:" // Height 3: input setup done
+ "cmp x11, #0x4\n"
+ "blt 106f\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x11, #0x8\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q4, [x15, #0x0]\n"
+ "blt 105f\n"
+ "104:" // Height 3: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr d5, [x15, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr x9, [x15, #0x18]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr d6, [x15, #0x20]\n"
+ "ldr x28, [x15, #0x28]\n"
+ "add x10, x10, #0x10\n"
+ "mov v5.d[1], x9\n"
+ "ldr d7, [x15, #0x30]\n"
+ "ldr x27, [x15, #0x38]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr d4, [x15, #0x40]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "ldr d5, [x15, #0x50]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr x9, [x15, #0x58]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "ldr d6, [x15, #0x60]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "ldr x28, [x15, #0x68]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "ldr d7, [x15, #0x70]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "ldr x27, [x15, #0x78]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "mov v6.d[1], x28\n"
+ "ldr d4, [x15, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "ldr x26, [x15, #0x88]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr d5, [x15, #0x90]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr x9, [x15, #0x98]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "mov v4.d[1], x26\n"
+ "ldr d6, [x15, #0xa0]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "ldr x28, [x15, #0xa8]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "ldr d7, [x15, #0xb0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "ldr x27, [x15, #0xb8]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "ldr d4, [x15, #0xc0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ldr x26, [x15, #0xc8]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "ldr d5, [x15, #0xd0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "ldr x9, [x15, #0xd8]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "ldr d6, [x15, #0xe0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "ldr x28, [x15, #0xe8]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "ldr d7, [x15, #0xf0]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "ldr x27, [x15, #0xf8]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "ldr d4, [x15, #0x100]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "ldr x26, [x15, #0x108]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "ldr d5, [x15, #0x110]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr x9, [x15, #0x118]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "ldr d6, [x15, #0x120]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "ldr x28, [x15, #0x128]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "ldr d7, [x15, #0x130]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "ldr x27, [x15, #0x138]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "ldr d4, [x15, #0x140]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr x26, [x15, #0x148]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "ldr d5, [x15, #0x150]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr x9, [x15, #0x158]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "ldr d6, [x15, #0x160]\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "ldr x28, [x15, #0x168]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "ldr d7, [x15, #0x170]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "ldr x27, [x15, #0x178]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "ldr x25, [x10, #0x8]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "ldr x23, [x24, #0x8]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr d0, [x10, #0x0]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "ldr d1, [x24, #0x0]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "ldr d2, [x22, #0x0]\n"
+ "mov v0.d[1], x25\n"
+ "cmp x11, #0x8\n"
+ "mov v1.d[1], x23\n"
+ "add x15, x15, #0x180\n"
+ "mov v2.d[1], x21\n"
+ "ldr d4, [x15, #0x0]\n"
+ "ldr x26, [x15, #0x8]\n"
+ "mov v4.d[1], x26\n"
+ "bge 104b\n"
+ "105:" // Height 3: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "ldr q4, [x15, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "ldr q5, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "ldr q4, [x15, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "ldr q5, [x15, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "ldr q4, [x15, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "ldr q5, [x15, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "ldr q4, [x15, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "ldr q5, [x15, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "106:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 108f\n"
+ "107:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "sub x11, x11, #0x1\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr q4, [x15, #0x0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x60\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "cbnz x11, 107b\n"
+ "108:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 101b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "add x23, x13, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 109f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x19]\n"
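+    // v1/v0 now hold the broadcast activation bounds (minval/maxval); the
+    // fmin/fmax pairs below clamp every live accumulator to that range.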
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "109:" // Height 3: No activation
+ "cmp x16, #0x18\n"
+ "bge 122f\n"
+ "tbz x16, #4, 113f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v11.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "tbz x16, #2, 111f\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 110f\n"
+ "str d13, [x13], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "b 121f\n"
+ "110:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 121f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "b 121f\n"
+ "111:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 112f\n"
+ "str d12, [x13], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "b 121f\n"
+ "112:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 121f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "b 121f\n"
+ "113:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 117f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "tbz x16, #2, 115f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 114f\n"
+ "str d11, [x13], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "b 121f\n"
+ "114:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 121f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "b 121f\n"
+ "115:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 116f\n"
+ "str d10, [x13], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "b 121f\n"
+ "116:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 121f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "b 121f\n"
+ "117:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 119f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "tbz x16, #1, 118f\n"
+ "str d9, [x13], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "b 121f\n"
+ "118:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 121f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "b 121f\n"
+ "119:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 120f\n"
+ "str d8, [x13], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "tbz x16, #0, 121f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "b 121f\n"
+ "120:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "121:" // Height 3: Partial direct writeback: Done
+ "b 123f\n"
+ "122:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x13, #0x40]\n"
+ "str q13, [x13, #0x50]\n"
+ "add x13, x13, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "123:" // Height 3: Writeback done
+ "subs x16, x16, #0x18\n"
+ "bgt 84b\n"
+ "b 166f\n"
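+    // Height 4 uses the full NEON register budget: v8-v31 hold the complete
+    // 4x24 accumulator tile (six 4-wide vectors per row), leaving v0-v3 for
+    // A values and v4-v7 for B columns.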
+ "124:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x13, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x10\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "125:" // Height 4: Column loop
+ "cbz x14, 126f\n"
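+    // Bias present: load one 24-wide bias row into v8-v13, then replicate it
+    // into the accumulators for rows 1-3 (v14-v19, v20-v25, v26-v31).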
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q12, [x14, #0x40]\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr q13, [x14, #0x50]\n"
+ "mov v17.16b, v11.16b\n"
+ "add x14, x14, #0x60\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v26.16b, v8.16b\n"
+ "mov v27.16b, v9.16b\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v29.16b, v11.16b\n"
+ "mov v30.16b, v12.16b\n"
+ "mov v31.16b, v13.16b\n"
+ "b 141f\n"
+ "126:" // Height 4: no bias
+ "tbz %x[flags], #0, 140f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x16, #0x18\n"
+ "add x23, x13, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 139f\n"
+ "tbz x16, #4, 130f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v11.4s }, [x13], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v23.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 128f\n"
+ "ld1 { v12.4s }, [x13], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v24.4s }, [x22], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 127f\n"
+ "ldr d13, [x13], #0x8\n"
+ "mov x19, #0x58\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v13.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v25.s }[2], [x22]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 138f\n"
+ "127:" // Height 4: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s13, [x13, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s25, [x22, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 138f\n"
+ "128:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x16, #1, 129f\n"
+ "ldr d12, [x13], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "mov x19, #0x48\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v12.s }[2], [x13]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 138f\n"
+ "129:" // Height 4: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s12, [x13, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 138f\n"
+ "130:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x16, #3, 134f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v15.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 132f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v16.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 131f\n"
+ "ldr d11, [x13], #0x8\n"
+ "mov x19, #0x38\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x16, #1, 133f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s16, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x16, #2, 136f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 135f\n"
+ "ldr d9, [x13], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 138f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x16, #1, 137f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x16, #0, 138f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x13, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "138:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 141f\n"
+ "139:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x13, #0x40]\n"
+ "ldr q13, [x13, #0x50]\n"
+ "ldr q14, [x23, #0x0]\n"
+ "ldr q15, [x23, #0x10]\n"
+ "ldr q16, [x23, #0x20]\n"
+ "ldr q17, [x23, #0x30]\n"
+ "ldr q18, [x23, #0x40]\n"
+ "ldr q19, [x23, #0x50]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q24, [x22, #0x40]\n"
+ "ldr q25, [x22, #0x50]\n"
+ "ldr q26, [x21, #0x0]\n"
+ "ldr q27, [x21, #0x10]\n"
+ "ldr q28, [x21, #0x20]\n"
+ "ldr q29, [x21, #0x30]\n"
+ "ldr q30, [x21, #0x40]\n"
+ "ldr q31, [x21, #0x50]\n"
+ "b 141f\n"
+ "140:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "141:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "142:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 143f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x12, 144f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 144f\n"
+ "143:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x24, x10, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
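+    // x10/x24/x22/x20 now address rows 0-3 of A; for direct input each row
+    // starts input_offset floats (hence the LSL #2) after the previous one.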
+ "144:" // Height 4: input setup done
+ "cmp x11, #0x4\n"
+ "blt 147f\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "cmp x11, #0x8\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x15, #0x0]\n"
+ "blt 146f\n"
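+    // Main loop: four K-steps per iteration. B is fetched as ldr d plus a
+    // mov v.d[1] insert from a separately loaded x register, splitting each
+    // 128-bit load into 64-bit halves; that scheduling suits in-order cores,
+    // so this file appears to be the Cortex-A55-tuned variant of the kernel.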
+ "145:" // Height 4: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr d5, [x15, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr x9, [x15, #0x18]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr d6, [x15, #0x20]\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr x28, [x15, #0x28]\n"
+ "mov v5.d[1], x9\n"
+ "ldr d7, [x15, #0x30]\n"
+ "ldr x27, [x15, #0x38]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr d4, [x15, #0x40]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr x26, [x15, #0x48]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr d5, [x15, #0x50]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "ldr x9, [x15, #0x58]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr d6, [x15, #0x60]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr x28, [x15, #0x68]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "ldr d7, [x15, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "ldr x27, [x15, #0x78]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "ldr x26, [x15, #0x88]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "ldr d4, [x15, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "ldr x9, [x15, #0x98]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "ldr d5, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr x28, [x15, #0xa8]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr x27, [x15, #0xb8]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v26.4s, v6.4s, v3.s[1]\n"
+ "ldr d6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr x26, [x15, #0xc8]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr x9, [x15, #0xd8]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v27.4s, v7.4s, v3.s[1]\n"
+ "ldr d7, [x15, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "ldr x28, [x15, #0xe8]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "ldr x25, [x10, #0x8]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v28.4s, v4.4s, v3.s[1]\n"
+ "ldr d4, [x15, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "ldr x27, [x15, #0xf8]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v29.4s, v5.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "ldr d5, [x15, #0xd0]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ldr x26, [x15, #0x108]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "ldr x23, [x24, #0x8]\n"
+ "fmla v30.4s, v6.4s, v3.s[1]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "ldr d6, [x15, #0xe0]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "ldr x9, [x15, #0x118]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v31.4s, v7.4s, v3.s[1]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "ldr d7, [x15, #0xf0]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "ldr x28, [x15, #0x128]\n"
+ "fmla v26.4s, v4.4s, v3.s[2]\n"
+ "ldr d4, [x15, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "ldr x27, [x15, #0x138]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v27.4s, v5.4s, v3.s[2]\n"
+ "ldr d5, [x15, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr x26, [x15, #0x148]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "ldr x21, [x22, #0x8]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v28.4s, v6.4s, v3.s[2]\n"
+ "ldr d6, [x15, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr x9, [x15, #0x158]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v29.4s, v7.4s, v3.s[2]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "ldr d7, [x15, #0x130]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "ldr x28, [x15, #0x168]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "ldr x19, [x20, #0x8]\n"
+ "fmla v30.4s, v4.4s, v3.s[2]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "ldr d4, [x15, #0x140]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "ldr x27, [x15, #0x178]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v31.4s, v5.4s, v3.s[2]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr d5, [x15, #0x150]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "cmp x11, #0x8\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "fmla v26.4s, v6.4s, v3.s[3]\n"
+ "mov v5.d[1], x9\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr d6, [x15, #0x160]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "fmla v27.4s, v7.4s, v3.s[3]\n"
+ "mov v6.d[1], x28\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "ldr d7, [x15, #0x170]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "ldr x26, [x15, #0x8]\n"
+ "fmla v28.4s, v4.4s, v3.s[3]\n"
+ "mov v7.d[1], x27\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "ldr d4, [x15, #0x0]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "mov v4.d[1], x26\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr d0, [x10, #0x0]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "ldr d1, [x24, #0x0]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "ldr d2, [x22, #0x0]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "mov v0.d[1], x25\n"
+ "mov v1.d[1], x23\n"
+ "ldr d3, [x20, #0x0]\n"
+ "mov v2.d[1], x21\n"
+ "mov v3.d[1], x19\n"
+ "bge 145b\n"
+ "146:" // Height 4: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x15, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "fmla v26.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "fmla v27.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "fmla v28.4s, v4.4s, v3.s[1]\n"
+ "ldr q4, [x15, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "fmla v29.4s, v5.4s, v3.s[1]\n"
+ "ldr q5, [x15, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "fmla v30.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "fmla v31.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "fmla v26.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x15, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "fmla v27.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x15, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "fmla v28.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "fmla v29.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x15, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x15, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "fmla v26.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "fmla v27.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "add x15, x15, #0x180\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v28.4s, v4.4s, v3.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "147:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 149f\n"
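+    // K % 4 leftover: consume one float from each of the four A rows per pass.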
+ "148:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "sub x11, x11, #0x1\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "ldr q4, [x15, #0x0]\n"
+ "ldr q5, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x15, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x15, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x60\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "cbnz x11, 148b\n"
+ "149:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 142b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "add x23, x13, x19, LSL #2\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 150f\n"
+ "add x20, %x[args_ptr], %[offset_min]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "150:" // Height 4: No activation
+ "cmp x16, #0x18\n"
+ "bge 163f\n"
+ "tbz x16, #4, 154f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v11.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 152f\n"
+ "st1 { v12.4s }, [x13], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "st1 { v24.4s }, [x22], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 151f\n"
+ "str d13, [x13], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "str d25, [x22], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v13.s }[2], [x13]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 162f\n"
+ "151:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 162f\n"
+ "str s13, [x13, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "str s25, [x22, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 162f\n"
+ "152:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 153f\n"
+ "str d12, [x13], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "str d24, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v12.s }[2], [x13]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "st1 { v24.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 162f\n"
+ "153:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 162f\n"
+ "str s12, [x13, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "str s24, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 162f\n"
+ "154:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 158f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 156f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v22.4s }, [x22], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 155f\n"
+ "str d11, [x13], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "str d23, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 162f\n"
+ "155:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 162f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "str s23, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 162f\n"
+ "156:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 157f\n"
+ "str d10, [x13], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 162f\n"
+ "157:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 162f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "b 162f\n"
+ "158:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 160f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 159f\n"
+ "str d9, [x13], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 162f\n"
+ "159:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 162f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 162f\n"
+ "160:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 161f\n"
+ "str d8, [x13], #0x8\n"
+ "str d14, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x16, #0, 162f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 162f\n"
+ "161:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s14, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "162:" // Height 4: Partial direct writeback: Done
+ "b 164f\n"
+ "163:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x13, #0x40]\n"
+ "str q13, [x13, #0x50]\n"
+ "add x13, x13, #0x60\n"
+ "str q14, [x23, #0x0]\n"
+ "str q15, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "str q17, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q20, [x22, #0x0]\n"
+ "str q21, [x22, #0x10]\n"
+ "str q22, [x22, #0x20]\n"
+ "str q23, [x22, #0x30]\n"
+ "str q24, [x22, #0x40]\n"
+ "str q25, [x22, #0x50]\n"
+ "str q26, [x21, #0x0]\n"
+ "str q27, [x21, #0x10]\n"
+ "str q28, [x21, #0x20]\n"
+ "str q29, [x21, #0x30]\n"
+ "str q30, [x21, #0x40]\n"
+ "str q31, [x21, #0x50]\n"
+ "164:" // Height 4: Writeback done
+ "subs x16, x16, #0x18\n"
+ "bgt 125b\n"
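+    // This block of up to four rows is finished; drop M by 4 and either bump
+    // the stored indirect row offset (flags bit 3 set) or advance the direct
+    // input pointer by four row strides before restarting at label 1.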
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 166f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "165:" // Update direct input
+ "mov x19, #0x10\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "166:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
new file mode 100644
index 0000000000..37d59cc327
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp
@@ -0,0 +1,2595 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_4x24 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
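+
+    // KernelArgs mirrors the layout the inline assembly reads through the
+    // offsetof() "I" operands below; any change to member order or type must
+    // be kept in sync with those constraints.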
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
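+
+    // flags bit layout consumed by the assembly: bit 0 = accumulate into the
+    // existing output, bit 1 = apply the min/max activation clamp, bit 2 =
+    // indirect output, bit 3 = indirect (string-based) input.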
+ __asm__ __volatile__(
+
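+    // Row-count dispatch: M >= 4 jumps to the Height 4 block (label 124) and
+    // loops in chunks of four rows; M == 3 branches to label 83, M == 2 to
+    // label 42, and M == 1 runs the code directly below. Each height repeats
+    // the same phases: column loop, bias/accumulate setup, K loop, writeback.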
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 124f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 83f\n"
+ "beq 42f\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[bias]\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x27, 3f\n"
+ "ldr q8, [x27, #0x0]\n"
+ "ldr q9, [x27, #0x10]\n"
+ "ldr q10, [x27, #0x20]\n"
+ "ldr q11, [x27, #0x30]\n"
+ "ldr q12, [x27, #0x40]\n"
+ "ldr q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "b 18f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "cmp x9, #0x18\n"
+ "bge 16f\n"
+ "tbz x9, #4, 7f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "tbz x9, #2, 5f\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 4f\n"
+ "mov x19, #0x58\n"
+ "ldr d13, [x26], #0x8\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "b 15f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s13, [x26, #0x0]\n"
+ "b 15f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x9, #1, 6f\n"
+ "ldr d12, [x26], #0x8\n"
+ "mov x19, #0x48\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "b 15f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s12, [x26, #0x0]\n"
+ "b 15f\n"
+ "7:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x9, #3, 11f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "tbz x9, #2, 9f\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 8f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x26], #0x8\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v11.s }[2], [x26]\n"
+ "b 15f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s11, [x26, #0x0]\n"
+ "b 15f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x9, #1, 10f\n"
+ "ldr d10, [x26], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "b 15f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s10, [x26, #0x0]\n"
+ "b 15f\n"
+ "11:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x9, #2, 13f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 12f\n"
+ "ldr d9, [x26], #0x8\n"
+ "mov x19, #0x18\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "b 15f\n"
+ "12:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s9, [x26, #0x0]\n"
+ "b 15f\n"
+ "13:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x9, #1, 14f\n"
+ "ldr d8, [x26], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v8.s }[2], [x26]\n"
+ "b 15f\n"
+ "14:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s8, [x26, #0x0]\n"
+ "mov x19, #0x0\n"
+ "15:" // Height 1: Partial accumulate: Done
+ "sub x26, x26, x19\n"
+ "b 18f\n"
+ "16:" // Height 1: full accumulate
+ "ldr q8, [x26, #0x0]\n"
+ "ldr q9, [x26, #0x10]\n"
+ "ldr q10, [x26, #0x20]\n"
+ "ldr q11, [x26, #0x30]\n"
+ "ldr q12, [x26, #0x40]\n"
+ "ldr q13, [x26, #0x50]\n"
+ "b 18f\n"
+ "17:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "18:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "19:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 21f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "21:" // Height 1: input setup done
+ "cmp x24, #0x4\n"
+ "blt 24f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "cmp x24, #0x8\n"
+ "blt 23f\n"
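+    // Height 1 main loop: four K-steps per iteration, six 4-wide FMLAs per
+    // step into v8-v13. B is read with plain 128-bit q loads in this generic
+    // build, unlike the split d-register loads of the in-order-tuned variant.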
+ "22:" // Height 1: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "sub x24, x24, #0x4\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "cmp x24, #0x8\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "bge 22b\n"
+ "23:" // Height 1: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "24:" // Height 1: Multiply loop: Main loop skip
+ "cbz x24, 26f\n"
+ "25:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "add x28, x28, #0x60\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "cbnz x24, 25b\n"
+ "26:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 19b\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "27:" // Height 1: No activation
+ "cmp x9, #0x18\n"
+ "bge 40f\n"
+ "tbz x9, #4, 31f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v11.4s }, [x26], #0x10\n"
+ "tbz x9, #2, 29f\n"
+ "st1 { v12.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 28f\n"
+ "str d13, [x26], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v13.s }[2], [x26]\n"
+ "b 39f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 39f\n"
+ "str s13, [x26, #0x0]\n"
+ "b 39f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 30f\n"
+ "str d12, [x26], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v12.s }[2], [x26]\n"
+ "b 39f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 39f\n"
+ "str s12, [x26, #0x0]\n"
+ "b 39f\n"
+ "31:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 35f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "tbz x9, #2, 33f\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 32f\n"
+ "str d11, [x26], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "b 39f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 39f\n"
+ "str s11, [x26, #0x0]\n"
+ "b 39f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 34f\n"
+ "str d10, [x26], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "b 39f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 39f\n"
+ "str s10, [x26, #0x0]\n"
+ "b 39f\n"
+ "35:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 37f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 36f\n"
+ "str d9, [x26], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "b 39f\n"
+ "36:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 39f\n"
+ "str s9, [x26, #0x0]\n"
+ "b 39f\n"
+ "37:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 38f\n"
+ "str d8, [x26], #0x8\n"
+ "tbz x9, #0, 39f\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "b 39f\n"
+ "38:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x26, #0x0]\n"
+ "39:" // Height 1: Partial direct writeback: Done
+ "b 41f\n"
+ "40:" // Height 1: Full writeback
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q12, [x26, #0x40]\n"
+ "str q13, [x26, #0x50]\n"
+ "add x26, x26, #0x60\n"
+ "41:" // Height 1: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 2b\n"
+ "b 166f\n"
+ "42:" // Height 2
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "43:" // Height 2: Column loop
+ "cbz x27, 44f\n"
+ "ldr q8, [x27, #0x0]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q9, [x27, #0x10]\n"
+ "ldr q10, [x27, #0x20]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q11, [x27, #0x30]\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr q12, [x27, #0x40]\n"
+ "ldr q13, [x27, #0x50]\n"
+ "mov v17.16b, v11.16b\n"
+ "add x27, x27, #0x60\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "b 59f\n"
+ "44:" // Height 2: no bias
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x9, #0x18\n"
+ "add x22, x26, x19, LSL #2\n"
+ "bge 57f\n"
+ "tbz x9, #4, 48f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 46f\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 45f\n"
+ "mov x19, #0x58\n"
+ "ldr d13, [x26], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "b 56f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "b 56f\n"
+ "46:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x9, #1, 47f\n"
+ "ldr d12, [x26], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "mov x19, #0x48\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "b 56f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s12, [x26, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "b 56f\n"
+ "48:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x9, #3, 52f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 50f\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 49f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x26], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v11.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "b 56f\n"
+ "49:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s11, [x26, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "b 56f\n"
+ "50:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x9, #1, 51f\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "b 56f\n"
+ "51:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "b 56f\n"
+ "52:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x9, #2, 54f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 53f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d15, [x22], #0x8\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x22]\n"
+ "b 56f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x9, #0, 56f\n"
+ "ldr s9, [x26, #0x0]\n"
+ "ldr s15, [x22, #0x0]\n"
+ "b 56f\n"
+ "54:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x9, #1, 55f\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x9, #0, 56f\n"
+ "ld1 { v8.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "b 56f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s8, [x26, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x22, #0x0]\n"
+ "56:" // Height 2: Partial accumulate: Done
+ "sub x26, x26, x19\n"
+ "b 59f\n"
+ "57:" // Height 2: full accumulate
+ "ldr q8, [x26, #0x0]\n"
+ "ldr q9, [x26, #0x10]\n"
+ "ldr q10, [x26, #0x20]\n"
+ "ldr q11, [x26, #0x30]\n"
+ "ldr q12, [x26, #0x40]\n"
+ "ldr q13, [x26, #0x50]\n"
+ "ldr q14, [x22, #0x0]\n"
+ "ldr q15, [x22, #0x10]\n"
+ "ldr q16, [x22, #0x20]\n"
+ "ldr q17, [x22, #0x30]\n"
+ "ldr q18, [x22, #0x40]\n"
+ "ldr q19, [x22, #0x50]\n"
+ "b 59f\n"
+ "58:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "59:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "60:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 62f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 62f\n"
+ "61:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "62:" // Height 2: input setup done
+ "cmp x24, #0x4\n"
+ "blt 65f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x8\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 64f\n"
+ "63:" // Height 2: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "sub x24, x24, #0x4\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "cmp x24, #0x8\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "bge 63b\n"
+ "64:" // Height 2: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "65:" // Height 2: Multiply loop: Main loop skip
+ "cbz x24, 67f\n"
+ "66:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "add x28, x28, #0x60\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "cbnz x24, 66b\n"
+ "67:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 60b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "tbz %x[flags], #1, 68f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "68:" // Height 2: No activation
+ "cmp x9, #0x18\n"
+ "bge 81f\n"
+ "tbz x9, #4, 72f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v11.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 70f\n"
+ "st1 { v12.4s }, [x26], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 69f\n"
+ "str d13, [x26], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v13.s }[2], [x26]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 80f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 80f\n"
+ "str s13, [x26, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 80f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 71f\n"
+ "str d12, [x26], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v12.s }[2], [x26]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 80f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 80f\n"
+ "str s12, [x26, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 80f\n"
+ "72:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 76f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 74f\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 73f\n"
+ "str d11, [x26], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 80f\n"
+ "73:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 80f\n"
+ "str s11, [x26, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 80f\n"
+ "74:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 75f\n"
+ "str d10, [x26], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 80f\n"
+ "75:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 80f\n"
+ "str s10, [x26, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "b 80f\n"
+ "76:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 78f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 77f\n"
+ "str d9, [x26], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "b 80f\n"
+ "77:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 80f\n"
+ "str s9, [x26, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "b 80f\n"
+ "78:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 79f\n"
+ "str d8, [x26], #0x8\n"
+ "str d14, [x22], #0x8\n"
+ "tbz x9, #0, 80f\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "st1 { v14.s }[2], [x22]\n"
+ "b 80f\n"
+ "79:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x26, #0x0]\n"
+ "str s14, [x22, #0x0]\n"
+ "80:" // Height 2: Partial direct writeback: Done
+ "b 82f\n"
+ "81:" // Height 2: Full writeback
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q12, [x26, #0x40]\n"
+ "str q13, [x26, #0x50]\n"
+ "add x26, x26, #0x60\n"
+ "str q14, [x22, #0x0]\n"
+ "str q15, [x22, #0x10]\n"
+ "str q16, [x22, #0x20]\n"
+ "str q17, [x22, #0x30]\n"
+ "str q18, [x22, #0x40]\n"
+ "str q19, [x22, #0x50]\n"
+ "82:" // Height 2: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 43b\n"
+ "b 166f\n"
+ "83:" // Height 3
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "84:" // Height 3: Column loop
+ "cbz x27, 85f\n"
+ "ldr q8, [x27, #0x0]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q9, [x27, #0x10]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q10, [x27, #0x20]\n"
+ "ldr q11, [x27, #0x30]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q12, [x27, #0x40]\n"
+ "mov v21.16b, v9.16b\n"
+ "ldr q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "mov v16.16b, v10.16b\n"
+ "mov v17.16b, v11.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "b 100f\n"
+ "85:" // Height 3: no bias
+ "tbz %x[flags], #0, 99f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x9, #0x18\n"
+ "add x22, x26, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 98f\n"
+ "tbz x9, #4, 89f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v23.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 87f\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 86f\n"
+ "mov x19, #0x58\n"
+ "ldr d13, [x26], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "b 97f\n"
+ "86:" // Height 3: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "b 97f\n"
+ "87:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x9, #1, 88f\n"
+ "ldr d12, [x26], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "mov x19, #0x48\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "b 97f\n"
+ "88:" // Height 3: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s12, [x26, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "b 97f\n"
+ "89:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x9, #3, 93f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 91f\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 90f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x26], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v11.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "b 97f\n"
+ "90:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s11, [x26, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "b 97f\n"
+ "91:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x9, #1, 92f\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "b 97f\n"
+ "92:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "b 97f\n"
+ "93:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x9, #2, 95f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 94f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d15, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "b 97f\n"
+ "94:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x9, #0, 97f\n"
+ "ldr s9, [x26, #0x0]\n"
+ "ldr s15, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "b 97f\n"
+ "95:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x9, #1, 96f\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "tbz x9, #0, 97f\n"
+ "ld1 { v8.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "b 97f\n"
+ "96:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s8, [x26, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "97:" // Height 3: Partial accumulate: Done
+ "sub x26, x26, x19\n"
+ "b 100f\n"
+ "98:" // Height 3: full accumulate
+ "ldr q8, [x26, #0x0]\n"
+ "ldr q9, [x26, #0x10]\n"
+ "ldr q10, [x26, #0x20]\n"
+ "ldr q11, [x26, #0x30]\n"
+ "ldr q12, [x26, #0x40]\n"
+ "ldr q13, [x26, #0x50]\n"
+ "ldr q14, [x22, #0x0]\n"
+ "ldr q15, [x22, #0x10]\n"
+ "ldr q16, [x22, #0x20]\n"
+ "ldr q17, [x22, #0x30]\n"
+ "ldr q18, [x22, #0x40]\n"
+ "ldr q19, [x22, #0x50]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x21, #0x40]\n"
+ "ldr q25, [x21, #0x50]\n"
+ "b 100f\n"
+ "99:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "100:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "101:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 102f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 103f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 103f\n"
+ "102:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "103:" // Height 3: input setup done
+ "cmp x24, #0x4\n"
+ "blt 106f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x8\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 105f\n"
+ "104:" // Height 3: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "sub x24, x24, #0x4\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "cmp x24, #0x8\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "bge 104b\n"
+ "105:" // Height 3: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "106:" // Height 3: Multiply loop: Main loop skip
+ "cbz x24, 108f\n"
+ "107:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "add x28, x28, #0x60\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "cbnz x24, 107b\n"
+ "108:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 101b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 109f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "109:" // Height 3: No activation
+ "cmp x9, #0x18\n"
+ "bge 122f\n"
+ "tbz x9, #4, 113f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v11.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 111f\n"
+ "st1 { v12.4s }, [x26], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 110f\n"
+ "str d13, [x26], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v13.s }[2], [x26]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 121f\n"
+ "110:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 121f\n"
+ "str s13, [x26, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 121f\n"
+ "111:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 112f\n"
+ "str d12, [x26], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v12.s }[2], [x26]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 121f\n"
+ "112:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 121f\n"
+ "str s12, [x26, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "b 121f\n"
+ "113:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 117f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 115f\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 114f\n"
+ "str d11, [x26], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "b 121f\n"
+ "114:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 121f\n"
+ "str s11, [x26, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "b 121f\n"
+ "115:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 116f\n"
+ "str d10, [x26], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "b 121f\n"
+ "116:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 121f\n"
+ "str s10, [x26, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "b 121f\n"
+ "117:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 119f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 118f\n"
+ "str d9, [x26], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "b 121f\n"
+ "118:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 121f\n"
+ "str s9, [x26, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "b 121f\n"
+ "119:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 120f\n"
+ "str d8, [x26], #0x8\n"
+ "str d14, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x9, #0, 121f\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "st1 { v14.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "b 121f\n"
+ "120:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x26, #0x0]\n"
+ "str s14, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "121:" // Height 3: Partial direct writeback: Done
+ "b 123f\n"
+ "122:" // Height 3: Full writeback
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q12, [x26, #0x40]\n"
+ "str q13, [x26, #0x50]\n"
+ "add x26, x26, #0x60\n"
+ "str q14, [x22, #0x0]\n"
+ "str q15, [x22, #0x10]\n"
+ "str q16, [x22, #0x20]\n"
+ "str q17, [x22, #0x30]\n"
+ "str q18, [x22, #0x40]\n"
+ "str q19, [x22, #0x50]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q25, [x21, #0x50]\n"
+ "123:" // Height 3: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 84b\n"
+ "b 166f\n"
+ "124:" // Height 4
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x10\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "125:" // Height 4: Column loop
+ "cbz x27, 126f\n"
+ "ldr q8, [x27, #0x0]\n"
+ "mov v14.16b, v8.16b\n"
+ "ldr q9, [x27, #0x10]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q10, [x27, #0x20]\n"
+ "mov v26.16b, v8.16b\n"
+ "ldr q11, [x27, #0x30]\n"
+ "ldr q12, [x27, #0x40]\n"
+ "mov v15.16b, v9.16b\n"
+ "ldr q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "mov v16.16b, v10.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v17.16b, v11.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v27.16b, v9.16b\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v29.16b, v11.16b\n"
+ "mov v30.16b, v12.16b\n"
+ "mov v31.16b, v13.16b\n"
+ "b 141f\n"
+ "126:" // Height 4: no bias
+ "tbz %x[flags], #0, 140f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x9, #0x18\n"
+ "add x22, x26, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 139f\n"
+ "tbz x9, #4, 130f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v23.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x9, #2, 128f\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 127f\n"
+ "mov x19, #0x58\n"
+ "ldr d13, [x26], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 138f\n"
+ "127:" // Height 4: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "b 138f\n"
+ "128:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x9, #1, 129f\n"
+ "ldr d12, [x26], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "mov x19, #0x48\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
+ "b 138f\n"
+ "129:" // Height 4: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s12, [x26, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
+ "b 138f\n"
+ "130:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x9, #3, 134f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x20], #0x10\n"
+ "tbz x9, #2, 132f\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 131f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x26], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v11.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s11, [x26, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x9, #1, 133f\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x9, #2, 136f\n"
+ "ld1 { v8.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 135f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d15, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x9, #0, 138f\n"
+ "ldr s9, [x26, #0x0]\n"
+ "ldr s15, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x9, #1, 137f\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "tbz x9, #0, 138f\n"
+ "ld1 { v8.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s8, [x26, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "138:" // Height 4: Partial accumulate: Done
+ "sub x26, x26, x19\n"
+ "b 141f\n"
+ "139:" // Height 4: full accumulate
+ "ldr q8, [x26, #0x0]\n"
+ "ldr q9, [x26, #0x10]\n"
+ "ldr q10, [x26, #0x20]\n"
+ "ldr q11, [x26, #0x30]\n"
+ "ldr q12, [x26, #0x40]\n"
+ "ldr q13, [x26, #0x50]\n"
+ "ldr q14, [x22, #0x0]\n"
+ "ldr q15, [x22, #0x10]\n"
+ "ldr q16, [x22, #0x20]\n"
+ "ldr q17, [x22, #0x30]\n"
+ "ldr q18, [x22, #0x40]\n"
+ "ldr q19, [x22, #0x50]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q24, [x21, #0x40]\n"
+ "ldr q25, [x21, #0x50]\n"
+ "ldr q26, [x20, #0x0]\n"
+ "ldr q27, [x20, #0x10]\n"
+ "ldr q28, [x20, #0x20]\n"
+ "ldr q29, [x20, #0x30]\n"
+ "ldr q30, [x20, #0x40]\n"
+ "ldr q31, [x20, #0x50]\n"
+ "b 141f\n"
+ "140:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "141:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "142:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 143f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x25, 144f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 144f\n"
+ "143:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "144:" // Height 4: input setup done
+ "cmp x24, #0x4\n"
+ "blt 147f\n"
+ "ldr q0, [x23, #0x0]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "cmp x24, #0x8\n"
+ "ldr q2, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "blt 146f\n"
+ "145:" // Height 4: Multiply loop: Main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x4\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x8\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "fmla v26.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "fmla v27.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "fmla v28.4s, v4.4s, v3.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "fmla v29.4s, v5.4s, v3.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "fmla v30.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "fmla v31.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "fmla v26.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "fmla v27.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "fmla v28.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "fmla v29.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "fmla v26.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "fmla v27.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v28.4s, v4.4s, v3.s[3]\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "ldr q0, [x23, #0x0]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "ldr q1, [x22, #0x0]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "bge 145b\n"
+ "146:" // Height 4: Multiply loop: Single iteration only
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "sub x24, x24, #0x4\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "add x23, x23, #0x10\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x28, #0x60]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x28, #0x70]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x80]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v20.4s, v6.4s, v2.s[1]\n"
+ "fmla v26.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v21.4s, v7.4s, v2.s[1]\n"
+ "fmla v27.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v1.s[1]\n"
+ "fmla v22.4s, v4.4s, v2.s[1]\n"
+ "fmla v28.4s, v4.4s, v3.s[1]\n"
+ "ldr q4, [x28, #0xc0]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v17.4s, v5.4s, v1.s[1]\n"
+ "fmla v23.4s, v5.4s, v2.s[1]\n"
+ "fmla v29.4s, v5.4s, v3.s[1]\n"
+ "ldr q5, [x28, #0xd0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v18.4s, v6.4s, v1.s[1]\n"
+ "fmla v24.4s, v6.4s, v2.s[1]\n"
+ "fmla v30.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x28, #0xe0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v19.4s, v7.4s, v1.s[1]\n"
+ "fmla v25.4s, v7.4s, v2.s[1]\n"
+ "fmla v31.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x28, #0xf0]\n"
+ "fmla v8.4s, v4.4s, v0.s[2]\n"
+ "fmla v14.4s, v4.4s, v1.s[2]\n"
+ "fmla v20.4s, v4.4s, v2.s[2]\n"
+ "fmla v26.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x28, #0x100]\n"
+ "fmla v9.4s, v5.4s, v0.s[2]\n"
+ "fmla v15.4s, v5.4s, v1.s[2]\n"
+ "fmla v21.4s, v5.4s, v2.s[2]\n"
+ "fmla v27.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x28, #0x110]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v16.4s, v6.4s, v1.s[2]\n"
+ "fmla v22.4s, v6.4s, v2.s[2]\n"
+ "fmla v28.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x28, #0x120]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v1.s[2]\n"
+ "fmla v23.4s, v7.4s, v2.s[2]\n"
+ "fmla v29.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x28, #0x130]\n"
+ "fmla v12.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v1.s[2]\n"
+ "fmla v24.4s, v4.4s, v2.s[2]\n"
+ "fmla v30.4s, v4.4s, v3.s[2]\n"
+ "ldr q4, [x28, #0x140]\n"
+ "fmla v13.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v1.s[2]\n"
+ "fmla v25.4s, v5.4s, v2.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[2]\n"
+ "ldr q5, [x28, #0x150]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v20.4s, v6.4s, v2.s[3]\n"
+ "fmla v26.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x28, #0x160]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v21.4s, v7.4s, v2.s[3]\n"
+ "fmla v27.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x28, #0x170]\n"
+ "add x28, x28, #0x180\n"
+ "fmla v10.4s, v4.4s, v0.s[3]\n"
+ "fmla v16.4s, v4.4s, v1.s[3]\n"
+ "fmla v22.4s, v4.4s, v2.s[3]\n"
+ "fmla v28.4s, v4.4s, v3.s[3]\n"
+ "fmla v11.4s, v5.4s, v0.s[3]\n"
+ "fmla v17.4s, v5.4s, v1.s[3]\n"
+ "fmla v23.4s, v5.4s, v2.s[3]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "fmla v12.4s, v6.4s, v0.s[3]\n"
+ "fmla v18.4s, v6.4s, v1.s[3]\n"
+ "fmla v24.4s, v6.4s, v2.s[3]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "fmla v13.4s, v7.4s, v0.s[3]\n"
+ "fmla v19.4s, v7.4s, v1.s[3]\n"
+ "fmla v25.4s, v7.4s, v2.s[3]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "147:" // Height 4: Multiply loop: Main loop skip
+ "cbz x24, 149f\n"
+ "148:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x23], #0x4\n"
+ "sub x24, x24, #0x1\n"
+ "ldr s1, [x22], #0x4\n"
+ "ldr s2, [x21], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "ldr q4, [x28, #0x0]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ "fmla v14.4s, v4.4s, v1.s[0]\n"
+ "ldr q6, [x28, #0x20]\n"
+ "fmla v20.4s, v4.4s, v2.s[0]\n"
+ "ldr q7, [x28, #0x30]\n"
+ "fmla v26.4s, v4.4s, v3.s[0]\n"
+ "ldr q4, [x28, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v15.4s, v5.4s, v1.s[0]\n"
+ "fmla v21.4s, v5.4s, v2.s[0]\n"
+ "fmla v27.4s, v5.4s, v3.s[0]\n"
+ "ldr q5, [x28, #0x50]\n"
+ "add x28, x28, #0x60\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v16.4s, v6.4s, v1.s[0]\n"
+ "fmla v22.4s, v6.4s, v2.s[0]\n"
+ "fmla v28.4s, v6.4s, v3.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v17.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v2.s[0]\n"
+ "fmla v29.4s, v7.4s, v3.s[0]\n"
+ "fmla v12.4s, v4.4s, v0.s[0]\n"
+ "fmla v18.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v2.s[0]\n"
+ "fmla v30.4s, v4.4s, v3.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[0]\n"
+ "fmla v19.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v2.s[0]\n"
+ "fmla v31.4s, v5.4s, v3.s[0]\n"
+ "cbnz x24, 148b\n"
+ "149:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 142b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "tbz %x[flags], #1, 150f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "150:" // Height 4: No activation
+ "cmp x9, #0x18\n"
+ "bge 163f\n"
+ "tbz x9, #4, 154f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v11.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "st1 { v27.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "tbz x9, #2, 152f\n"
+ "st1 { v12.4s }, [x26], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v30.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 151f\n"
+ "str d13, [x26], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "str d31, [x20], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v13.s }[2], [x26]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 162f\n"
+ "151:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 162f\n"
+ "str s13, [x26, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 162f\n"
+ "152:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 153f\n"
+ "str d12, [x26], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v12.s }[2], [x26]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "b 162f\n"
+ "153:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 162f\n"
+ "str s12, [x26, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
+ "b 162f\n"
+ "154:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 158f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "st1 { v27.4s }, [x20], #0x10\n"
+ "tbz x9, #2, 156f\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 155f\n"
+ "str d11, [x26], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
+ "b 162f\n"
+ "155:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 162f\n"
+ "str s11, [x26, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
+ "b 162f\n"
+ "156:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 157f\n"
+ "str d10, [x26], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
+ "b 162f\n"
+ "157:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 162f\n"
+ "str s10, [x26, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
+ "b 162f\n"
+ "158:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 160f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 159f\n"
+ "str d9, [x26], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 162f\n"
+ "159:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 162f\n"
+ "str s9, [x26, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 162f\n"
+ "160:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 161f\n"
+ "str d8, [x26], #0x8\n"
+ "str d14, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x9, #0, 162f\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "st1 { v14.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 162f\n"
+ "161:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x26, #0x0]\n"
+ "str s14, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "162:" // Height 4: Partial direct writeback: Done
+ "b 164f\n"
+ "163:" // Height 4: Full writeback
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q12, [x26, #0x40]\n"
+ "str q13, [x26, #0x50]\n"
+ "add x26, x26, #0x60\n"
+ "str q14, [x22, #0x0]\n"
+ "str q15, [x22, #0x10]\n"
+ "str q16, [x22, #0x20]\n"
+ "str q17, [x22, #0x30]\n"
+ "str q18, [x22, #0x40]\n"
+ "str q19, [x22, #0x50]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q25, [x21, #0x50]\n"
+ "str q26, [x20, #0x0]\n"
+ "str q27, [x20, #0x10]\n"
+ "str q28, [x20, #0x20]\n"
+ "str q29, [x20, #0x30]\n"
+ "str q30, [x20, #0x40]\n"
+ "str q31, [x20, #0x50]\n"
+ "164:" // Height 4: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 125b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 166f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "165:" // Update direct input
+ "mov x19, #0x10\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "166:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
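
The writeback tail above is worth a note: rather than looping per element, the kernel decomposes the remaining column count (kept in x9) bit by bit with tbz, so each power-of-two chunk of the block is stored at most once per row. A minimal scalar sketch of the same scheme; store_tail is an illustrative name, not library API:

    #include <cstring>

    // Equivalent of the tbz ladder: test bits 4..0 of the remaining width
    // (16, 8, 4, 2, 1 elements) and emit one chunked store per set bit.
    static void store_tail(float *dst, const float *acc, unsigned int n_left)
    {
        unsigned int pos = 0;
        for (int bit = 4; bit >= 0; bit--) {
            unsigned int chunk = 1u << bit;
            if (n_left & chunk) {
                std::memcpy(dst + pos, acc + pos, chunk * sizeof(float));
                pos += chunk;   // vector stores with post-increment in the kernel
            }
        }
    }
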
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index 7f83e617c5..de94e72ab0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_fp32_mla_6x16_a55( ARGLIST );
class cls_a64_hybrid_fp32_mla_6x16
{
public:
- typedef float operand_type;
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,20 +71,28 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 1> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 3.04 };
- case CPUModel::A53:
- return { 1.43 };
- case CPUModel::A73:
- return { 2.56 };
- default:
- return { 6.667 };
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 2.986 };
+ case CPUModel::A53:
+ return { 1.43 };
+ case CPUModel::A73:
+ return { 2.56 };
+ default:
+ return { 6.667 };
+ case CPUModel::A510:
+ return { 3.88 };
+ case CPUModel::V1:
+ return { 13.72 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
@@ -104,4 +113,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
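
The templated get_performance_parameters added here lets one kernel class report a throughput estimate keyed to the datatype the caller will actually compute in, with a neutral { 1.0 } fallback for anything else. A hedged sketch of how such estimates can drive kernel choice; the names below are illustrative, and the real plumbing lives in the GemmImplementation tables of gemm_fp32.cpp:

    #include <cstddef>

    struct Estimate {
        const char *name;
        float macs_per_cycle;   // what get_performance_parameters() reports
    };

    // Pick the candidate with the highest estimated MAC throughput.
    static const Estimate *pick_kernel(const Estimate *cands, std::size_t n)
    {
        const Estimate *best = nullptr;
        for (std::size_t i = 0; i < n; i++) {
            if (best == nullptr || cands[i].macs_per_cycle > best->macs_per_cycle) {
                best = &cands[i];
            }
        }
        return best;
    }
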
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
index 184cfaf95c..e8b7db21bd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp
@@ -839,14 +839,14 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
- "mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
- "mov v14.16b, v10.16b\n"
- "mov v18.16b, v10.16b\n"
"ldr q11, [x16, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
"add x16, x16, #0x40\n"
+ "mov v14.16b, v10.16b\n"
"mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v18.16b, v10.16b\n"
"mov v19.16b, v11.16b\n"
"b 80f\n"
"69:" // Height 3: no bias
@@ -1364,18 +1364,18 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q11, [x16, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "add x16, x16, #0x40\n"
"mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
"mov v23.16b, v11.16b\n"
"b 113f\n"
"102:" // Height 4: no bias
@@ -1996,22 +1996,22 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q11, [x16, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "add x16, x16, #0x40\n"
"mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
"mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
- "mov v23.16b, v11.16b\n"
"mov v27.16b, v11.16b\n"
"b 146f\n"
"135:" // Height 5: no bias
@@ -2738,26 +2738,26 @@ void a64_hybrid_fp32_mla_6x16_a55 (
"ldr q9, [x16, #0x10]\n"
"ldr q10, [x16, #0x20]\n"
"mov v12.16b, v8.16b\n"
- "mov v16.16b, v8.16b\n"
+ "ldr q11, [x16, #0x30]\n"
"mov v13.16b, v9.16b\n"
- "mov v17.16b, v9.16b\n"
+ "add x16, x16, #0x40\n"
"mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v17.16b, v9.16b\n"
"mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
"mov v20.16b, v8.16b\n"
"mov v21.16b, v9.16b\n"
"mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
"mov v24.16b, v8.16b\n"
"mov v25.16b, v9.16b\n"
"mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
"mov v28.16b, v8.16b\n"
"mov v29.16b, v9.16b\n"
"mov v30.16b, v10.16b\n"
- "ldr q11, [x16, #0x30]\n"
- "add x16, x16, #0x40\n"
- "mov v15.16b, v11.16b\n"
- "mov v19.16b, v11.16b\n"
- "mov v23.16b, v11.16b\n"
- "mov v27.16b, v11.16b\n"
"mov v31.16b, v11.16b\n"
"b 179f\n"
"168:" // Height 6: no bias
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
index f5504b44d4..28e9be4cb7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -1893,8 +1893,8 @@ void a64_hybrid_fp32_mla_6x16 (
"ld1 { v22.4s }, [x22], #0x10\n"
"ld1 { v26.4s }, [x21], #0x10\n"
"tbz x11, #1, 136f\n"
- "mov x19, #0x38\n"
"ldr d11, [x28], #0x8\n"
+ "mov x19, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"ldr d23, [x22], #0x8\n"
@@ -1947,8 +1947,8 @@ void a64_hybrid_fp32_mla_6x16 (
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
"tbz x11, #1, 140f\n"
- "mov x19, #0x18\n"
"ldr d9, [x28], #0x8\n"
+ "mov x19, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
@@ -2586,12 +2586,12 @@ void a64_hybrid_fp32_mla_6x16 (
"ld1 { v16.4s }, [x23], #0x10\n"
"ld1 { v20.4s }, [x22], #0x10\n"
"ld1 { v24.4s }, [x21], #0x10\n"
- "ld1 { v28.4s }, [x20], #0x10\n"
"ld1 { v9.4s }, [x28], #0x10\n"
"ld1 { v13.4s }, [x24], #0x10\n"
"ld1 { v17.4s }, [x23], #0x10\n"
"ld1 { v21.4s }, [x22], #0x10\n"
"ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
"ld1 { v29.4s }, [x20], #0x10\n"
"tbz x11, #2, 170f\n"
"ld1 { v10.4s }, [x28], #0x10\n"
@@ -2601,8 +2601,8 @@ void a64_hybrid_fp32_mla_6x16 (
"ld1 { v26.4s }, [x21], #0x10\n"
"ld1 { v30.4s }, [x20], #0x10\n"
"tbz x11, #1, 169f\n"
- "mov x19, #0x38\n"
"ldr d11, [x28], #0x8\n"
+ "mov x19, #0x38\n"
"ldr d15, [x24], #0x8\n"
"ldr d19, [x23], #0x8\n"
"ldr d23, [x22], #0x8\n"
@@ -2662,8 +2662,8 @@ void a64_hybrid_fp32_mla_6x16 (
"ld1 { v24.4s }, [x21], #0x10\n"
"ld1 { v28.4s }, [x20], #0x10\n"
"tbz x11, #1, 173f\n"
- "mov x19, #0x18\n"
"ldr d9, [x28], #0x8\n"
+ "mov x19, #0x18\n"
"ldr d13, [x24], #0x8\n"
"ldr d17, [x23], #0x8\n"
"ldr d21, [x22], #0x8\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
index 957754ad68..4fad58a83d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#define ARGLIST \
@@ -43,7 +43,8 @@ void a64_hybrid_fp32_mla_8x4_a55( ARGLIST );
class cls_a64_hybrid_fp32_mla_8x4
{
public:
- typedef float operand_type;
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -69,7 +70,7 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
+ StdTransformsFixed<rhs_operand_type, result_type, 8, 4, 1> transforms = {};
// Default to the generic kernel
kern_type kernel=a64_hybrid_fp32_mla_8x4;
@@ -89,4 +90,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
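
The operand_type split repeated in this header is the enabling change for the mixed-precision kernels that follow: a kernel can now declare different left- and right-hand operand types, and the transforms template is keyed on the right-hand (pretransposed) type. For the pure-fp32 kernels both typedefs are simply float; a minimal sketch of the pattern, with illustrative names:

    template <typename TLhs, typename TRhs, typename TResult>
    struct MixedKernelTraits {
        typedef TLhs    lhs_operand_type;   // e.g. float rows read at run time
        typedef TRhs    rhs_operand_type;   // e.g. bfloat16, pre-converted B
        typedef TResult result_type;        // e.g. float
    };

    // The fp32-only kernels above degenerate to:
    typedef MixedKernelTraits<float, float, float> Fp32Traits;
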
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
new file mode 100644
index 0000000000..090dd5855e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24.hpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_fp32bf16fp32_mmla_4x24( ARGLIST );
+
+class cls_a64_hybrid_fp32bf16fp32_mmla_4x24
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 24;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 24, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 18.9 };
+ case CPUModel::A510:
+ return { 6.81 };
+ case CPUModel::V1:
+ return { 28.40 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp32bf16fp32_mmla_4x24;
+ cls_a64_hybrid_fp32bf16fp32_mmla_4x24(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
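
This is the headline kernel of the patch: fp32 in, fp32 out, but the inner product runs through BFMMLA after rounding the operands to bfloat16, which is what fast_mode trades for the much higher MAC rate (compare the 18.9 estimate here against 6.667 for the plain fp32 6x16 kernel). A scalar picture of the arithmetic; to_bf16 is an illustrative helper (it truncates, where the hardware BFCVTN rounds to nearest even), not library API:

    #include <cstdint>
    #include <cstring>

    // Drop the low 16 mantissa bits of an fp32 value, i.e. bfloat16 precision.
    static float to_bf16(float x)
    {
        uint32_t u;
        std::memcpy(&u, &x, sizeof(u));
        u &= 0xffff0000u;
        std::memcpy(&x, &u, sizeof(x));
        return x;
    }

    // fast_mode dot product: bf16-precision inputs, fp32 accumulation,
    // mirroring what the BFMMLA pipeline below computes.
    static float dot_fast_mode(const float *a, const float *b, int k)
    {
        float acc = 0.0f;
        for (int i = 0; i < k; i++) {
            acc += to_bf16(a[i]) * to_bf16(b[i]);
        }
        return acc;
    }
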
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
new file mode 100644
index 0000000000..76c2688291
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp
@@ -0,0 +1,2426 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32bf16fp32_mmla_4x24 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
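+    // Flags word decoded by the assembly below with tbz:
+    //   bit 0 (0x1): accumulate into the existing output
+    //   bit 1 (0x2): apply the min/max clamp (ReLU / BoundedReLU)
+    //   bit 2 (0x4): output rows reached through a pointer table
+    //   bit 3 (0x8): input rows gathered through a pointer table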
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 130f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 87f\n"
+ "beq 44f\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[bias]\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x27, 3f\n"
+ "ldr q8, [x27, #0x0]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x27, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x27, #0x20]\n"
+ "ldr q11, [x27, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x27, #0x40]\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "b 19f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 18f\n"
+ "cmp x9, #0x18\n"
+ "bge 16f\n"
+ "tbz x9, #4, 7f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "tbz x9, #2, 5f\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 4f\n"
+ "mov x19, #0x58\n"
+ "ldr d20, [x26], #0x8\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v20.s }[2], [x26]\n"
+ "b 15f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s20, [x26, #0x0]\n"
+ "b 15f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x9, #1, 6f\n"
+ "ldr d13, [x26], #0x8\n"
+ "mov x19, #0x48\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "b 15f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s13, [x26, #0x0]\n"
+ "b 15f\n"
+ "7:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x9, #3, 11f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "tbz x9, #2, 9f\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 8f\n"
+ "mov x19, #0x38\n"
+ "ldr d12, [x26], #0x8\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "b 15f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s12, [x26, #0x0]\n"
+ "b 15f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x9, #1, 10f\n"
+ "ldr d11, [x26], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v11.s }[2], [x26]\n"
+ "b 15f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s11, [x26, #0x0]\n"
+ "b 15f\n"
+ "11:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x9, #2, 13f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 12f\n"
+ "ldr d10, [x26], #0x8\n"
+ "mov x19, #0x18\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "b 15f\n"
+ "12:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x9, #0, 15f\n"
+ "ldr s10, [x26, #0x0]\n"
+ "b 15f\n"
+ "13:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x9, #1, 14f\n"
+ "ldr d9, [x26], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x9, #0, 15f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "b 15f\n"
+ "14:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x26, #0x0]\n"
+ "mov x19, #0x0\n"
+ "15:" // Height 1: Partial accumulate: Done
+ "sub x26, x26, x19\n"
+ "b 17f\n"
+ "16:" // Height 1: full accumulate
+ "ldr q9, [x26, #0x0]\n"
+ "ldr q10, [x26, #0x10]\n"
+ "ldr q11, [x26, #0x20]\n"
+ "ldr q12, [x26, #0x30]\n"
+ "ldr q13, [x26, #0x40]\n"
+ "ldr q20, [x26, #0x50]\n"
+ "17:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "b 19f\n"
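+      // BFMMLA writes 2x2 tiles, interleaving even/odd output rows; the
+      // zip1/zip2 above massage freshly loaded row-major accumulators into
+      // that tiled layout, and the uzp1/uzp2 at writeback undo it.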
+ "18:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "19:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "20:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 21f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 22f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 22f\n"
+ "21:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "22:" // Height 1: input setup done
+ "cmp x24, #0x4\n"
+ "blt 25f\n"
+ "ld1 { v0.4s }, [x23], #0x10\n"
+ "cmp x24, #0x8\n"
+ "blt 24f\n"
+ "23:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x23], #0x10\n"
+ "bge 23b\n"
+ "24:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "25:" // Height 1: Multiply loop: Main loop skip
+ "cbz x24, 28f\n"
+ "cbz x24, 28f\n"
+ "tbz x24, #1, 26f\n"
+ "ldr d0, [x23], #0x8\n"
+ "tbz x24, #0, 27f\n"
+ "ld1 { v0.s }[2], [x23]\n"
+ "b 27f\n"
+ "26:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x23, #0x0]\n"
+ "27:" // Height 1: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "28:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 20b\n"
+ "uzp1 v8.2d, v8.2d, v14.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "uzp1 v9.2d, v9.2d, v15.2d\n"
+ "uzp1 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v13.2d, v13.2d, v19.2d\n"
+ "tbz %x[flags], #1, 29f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "29:" // Height 1: No activation
+ "cmp x9, #0x18\n"
+ "bge 42f\n"
+ "tbz x9, #4, 33f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "st1 { v11.4s }, [x26], #0x10\n"
+ "tbz x9, #2, 31f\n"
+ "st1 { v12.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 30f\n"
+ "str d13, [x26], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v13.s }[2], [x26]\n"
+ "b 41f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 41f\n"
+ "str s13, [x26, #0x0]\n"
+ "b 41f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 32f\n"
+ "str d12, [x26], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v12.s }[2], [x26]\n"
+ "b 41f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 41f\n"
+ "str s12, [x26, #0x0]\n"
+ "b 41f\n"
+ "33:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 37f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "st1 { v9.4s }, [x26], #0x10\n"
+ "tbz x9, #2, 35f\n"
+ "st1 { v10.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 34f\n"
+ "str d11, [x26], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v11.s }[2], [x26]\n"
+ "b 41f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 41f\n"
+ "str s11, [x26, #0x0]\n"
+ "b 41f\n"
+ "35:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 36f\n"
+ "str d10, [x26], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v10.s }[2], [x26]\n"
+ "b 41f\n"
+ "36:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 41f\n"
+ "str s10, [x26, #0x0]\n"
+ "b 41f\n"
+ "37:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 39f\n"
+ "st1 { v8.4s }, [x26], #0x10\n"
+ "tbz x9, #1, 38f\n"
+ "str d9, [x26], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v9.s }[2], [x26]\n"
+ "b 41f\n"
+ "38:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 41f\n"
+ "str s9, [x26, #0x0]\n"
+ "b 41f\n"
+ "39:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 40f\n"
+ "str d8, [x26], #0x8\n"
+ "tbz x9, #0, 41f\n"
+ "st1 { v8.s }[2], [x26]\n"
+ "b 41f\n"
+ "40:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x26, #0x0]\n"
+ "41:" // Height 1: Partial direct writeback: Done
+ "b 43f\n"
+ "42:" // Height 1: Full writeback
+ "str q8, [x26, #0x0]\n"
+ "str q9, [x26, #0x10]\n"
+ "str q10, [x26, #0x20]\n"
+ "str q11, [x26, #0x30]\n"
+ "str q12, [x26, #0x40]\n"
+ "str q13, [x26, #0x50]\n"
+ "add x26, x26, #0x60\n"
+ "43:" // Height 1: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 2b\n"
+ "b 174f\n"
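+      // Heights 2-4 below repeat this structure with more row pointers;
+      // BFCVTN packs one fp32 row into the low half of a bf16 register and
+      // BFCVTN2 a second row into the high half, so rows are consumed in
+      // pairs per BFMMLA.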
+ "44:" // Height 2
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "45:" // Height 2: Column loop
+ "cbz x27, 46f\n"
+ "ldr q8, [x27, #0x0]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x27, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x27, #0x20]\n"
+ "ldr q11, [x27, #0x30]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "ldr q12, [x27, #0x40]\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "ldr q13, [x27, #0x50]\n"
+ "add x27, x27, #0x60\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "b 62f\n"
+ "46:" // Height 2: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x9, #0x18\n"
+ "add x22, x26, x19, LSL #2\n"
+ "bge 59f\n"
+ "tbz x9, #4, 50f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 48f\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 47f\n"
+ "mov x19, #0x58\n"
+ "ldr d20, [x26], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v20.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "b 58f\n"
+ "47:" // Height 2: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s20, [x26, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "b 58f\n"
+ "48:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x9, #1, 49f\n"
+ "ldr d13, [x26], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "mov x19, #0x48\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "b 58f\n"
+ "49:" // Height 2: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "b 58f\n"
+ "50:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x9, #3, 54f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 52f\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 51f\n"
+ "mov x19, #0x38\n"
+ "ldr d12, [x26], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "b 58f\n"
+ "51:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s12, [x26, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "b 58f\n"
+ "52:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x9, #1, 53f\n"
+ "ldr d11, [x26], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v11.s }[2], [x26]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "b 58f\n"
+ "53:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s11, [x26, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "b 58f\n"
+ "54:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x9, #2, 56f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 55f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d15, [x22], #0x8\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x22]\n"
+ "b 58f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x9, #0, 58f\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s15, [x22, #0x0]\n"
+ "b 58f\n"
+ "56:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x9, #1, 57f\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x9, #0, 58f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "b 58f\n"
+ "57:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x26, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x22, #0x0]\n"
+ "58:" // Height 2: Partial accumulate: Done
+ "sub x26, x26, x19\n"
+ "b 60f\n"
+ "59:" // Height 2: full accumulate
+ "ldr q9, [x26, #0x0]\n"
+ "ldr q10, [x26, #0x10]\n"
+ "ldr q11, [x26, #0x20]\n"
+ "ldr q12, [x26, #0x30]\n"
+ "ldr q13, [x26, #0x40]\n"
+ "ldr q20, [x26, #0x50]\n"
+ "ldr q14, [x22, #0x0]\n"
+ "ldr q15, [x22, #0x10]\n"
+ "ldr q16, [x22, #0x20]\n"
+ "ldr q17, [x22, #0x30]\n"
+ "ldr q18, [x22, #0x40]\n"
+ "ldr q19, [x22, #0x50]\n"
+ "60:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "b 62f\n"
+ "61:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "62:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "63:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 65f\n"
+ "64:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "65:" // Height 2: input setup done
+ "cmp x24, #0x4\n"
+ "blt 68f\n"
+ "ld1 { v0.4s }, [x23], #0x10\n"
+ "cmp x24, #0x8\n"
+ "blt 67f\n"
+ "66:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x22], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x23], #0x10\n"
+ "bge 66b\n"
+ "67:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x22], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q7, [x28, #0x30]\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "68:" // Height 2: Multiply loop: Main loop skip
+ "cbz x24, 71f\n"
+ "cbz x24, 71f\n"
+ "tbz x24, #1, 69f\n"
+ "ldr d0, [x23], #0x8\n"
+ "ldr d1, [x22], #0x8\n"
+ "tbz x24, #0, 70f\n"
+ "ld1 { v0.s }[2], [x23]\n"
+ "ld1 { v1.s }[2], [x22]\n"
+ "b 70f\n"
+ "69:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x23, #0x0]\n"
+ "ldr s1, [x22, #0x0]\n"
+ "70:" // Height 2: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "71:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 63b\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "tbz %x[flags], #1, 72f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v4.4s, v4.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmax v4.4s, v4.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "72:" // Height 2: No activation
+ "cmp x9, #0x18\n"
+ "bge 85f\n"
+ "tbz x9, #4, 76f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x26], #0x10\n"
+ "st1 { v16.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "st1 { v9.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x22], #0x10\n"
+ "st1 { v11.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 74f\n"
+ "st1 { v17.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 73f\n"
+ "str d18, [x26], #0x8\n"
+ "str d13, [x22], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v18.s }[2], [x26]\n"
+ "st1 { v13.s }[2], [x22]\n"
+ "b 84f\n"
+ "73:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 84f\n"
+ "str s18, [x26, #0x0]\n"
+ "str s13, [x22, #0x0]\n"
+ "b 84f\n"
+ "74:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 75f\n"
+ "str d17, [x26], #0x8\n"
+ "str d12, [x22], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "st1 { v12.s }[2], [x22]\n"
+ "b 84f\n"
+ "75:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 84f\n"
+ "str s17, [x26, #0x0]\n"
+ "str s12, [x22, #0x0]\n"
+ "b 84f\n"
+ "76:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 80f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "st1 { v9.4s }, [x22], #0x10\n"
+ "tbz x9, #2, 78f\n"
+ "st1 { v15.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 77f\n"
+ "str d16, [x26], #0x8\n"
+ "str d11, [x22], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v16.s }[2], [x26]\n"
+ "st1 { v11.s }[2], [x22]\n"
+ "b 84f\n"
+ "77:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 84f\n"
+ "str s16, [x26, #0x0]\n"
+ "str s11, [x22, #0x0]\n"
+ "b 84f\n"
+ "78:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 79f\n"
+ "str d15, [x26], #0x8\n"
+ "str d10, [x22], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v15.s }[2], [x26]\n"
+ "st1 { v10.s }[2], [x22]\n"
+ "b 84f\n"
+ "79:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 84f\n"
+ "str s15, [x26, #0x0]\n"
+ "str s10, [x22, #0x0]\n"
+ "b 84f\n"
+ "80:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 82f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "tbz x9, #1, 81f\n"
+ "str d14, [x26], #0x8\n"
+ "str d9, [x22], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v14.s }[2], [x26]\n"
+ "st1 { v9.s }[2], [x22]\n"
+ "b 84f\n"
+ "81:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 84f\n"
+ "str s14, [x26, #0x0]\n"
+ "str s9, [x22, #0x0]\n"
+ "b 84f\n"
+ "82:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 83f\n"
+ "str d4, [x26], #0x8\n"
+ "str d8, [x22], #0x8\n"
+ "tbz x9, #0, 84f\n"
+ "st1 { v4.s }[2], [x26]\n"
+ "st1 { v8.s }[2], [x22]\n"
+ "b 84f\n"
+ "83:" // Height 2: Partial direct writeback: partial_1_0
+ "str s4, [x26, #0x0]\n"
+ "str s8, [x22, #0x0]\n"
+ "84:" // Height 2: Partial direct writeback: Done
+ "b 86f\n"
+ "85:" // Height 2: Full writeback
+ "str q4, [x26, #0x0]\n"
+ "str q14, [x26, #0x10]\n"
+ "str q15, [x26, #0x20]\n"
+ "str q16, [x26, #0x30]\n"
+ "str q17, [x26, #0x40]\n"
+ "str q18, [x26, #0x50]\n"
+ "add x26, x26, #0x60\n"
+ "str q8, [x22, #0x0]\n"
+ "str q9, [x22, #0x10]\n"
+ "str q10, [x22, #0x20]\n"
+ "str q11, [x22, #0x30]\n"
+ "str q12, [x22, #0x40]\n"
+ "str q13, [x22, #0x50]\n"
+ "86:" // Height 2: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 45b\n"
+ "b 174f\n"
+ "87:" // Height 3
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "88:" // Height 3: Column loop
+ "cbz x27, 89f\n"
+ "ldr q8, [x27, #0x0]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x27, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x27, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x27, #0x30]\n"
+ "mov v26.16b, v14.16b\n"
+ "ldr q12, [x27, #0x40]\n"
+ "ldr q13, [x27, #0x50]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "add x27, x27, #0x60\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v27.16b, v15.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v29.16b, v17.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v30.16b, v18.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v31.16b, v19.16b\n"
+ "b 105f\n"
+ "89:" // Height 3: no bias
+ "tbz %x[flags], #0, 104f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x9, #0x18\n"
+ "add x22, x26, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 102f\n"
+ "tbz x9, #4, 93f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v23.4s }, [x21], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 91f\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 90f\n"
+ "mov x19, #0x58\n"
+ "ldr d20, [x26], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d4, [x21], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v20.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v4.s }[2], [x21]\n"
+ "b 101f\n"
+ "90:" // Height 3: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s20, [x26, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s4, [x21, #0x0]\n"
+ "b 101f\n"
+ "91:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x9, #1, 92f\n"
+ "ldr d13, [x26], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "mov x19, #0x48\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "b 101f\n"
+ "92:" // Height 3: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "b 101f\n"
+ "93:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x9, #3, 97f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 95f\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v23.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 94f\n"
+ "mov x19, #0x38\n"
+ "ldr d12, [x26], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "b 101f\n"
+ "94:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s12, [x26, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "b 101f\n"
+ "95:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x9, #1, 96f\n"
+ "ldr d11, [x26], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d23, [x21], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v11.s }[2], [x26]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "b 101f\n"
+ "96:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s11, [x26, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "b 101f\n"
+ "97:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x9, #2, 99f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 98f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d15, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "b 101f\n"
+ "98:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x9, #0, 101f\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s15, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "b 101f\n"
+ "99:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x9, #1, 100f\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "tbz x9, #0, 101f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "b 101f\n"
+ "100:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x26, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "101:" // Height 3: Partial accumulate: Done
+ "sub x26, x26, x19\n"
+ "b 103f\n"
+ "102:" // Height 3: full accumulate
+ "ldr q9, [x26, #0x0]\n"
+ "ldr q10, [x26, #0x10]\n"
+ "ldr q11, [x26, #0x20]\n"
+ "ldr q12, [x26, #0x30]\n"
+ "ldr q13, [x26, #0x40]\n"
+ "ldr q20, [x26, #0x50]\n"
+ "ldr q14, [x22, #0x0]\n"
+ "ldr q15, [x22, #0x10]\n"
+ "ldr q16, [x22, #0x20]\n"
+ "ldr q17, [x22, #0x30]\n"
+ "ldr q18, [x22, #0x40]\n"
+ "ldr q19, [x22, #0x50]\n"
+ "ldr q21, [x21, #0x0]\n"
+ "ldr q22, [x21, #0x10]\n"
+ "ldr q23, [x21, #0x20]\n"
+ "ldr q24, [x21, #0x30]\n"
+ "ldr q25, [x21, #0x40]\n"
+ "ldr q4, [x21, #0x50]\n"
+ "103:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "zip1 v20.2d, v21.2d, v26.2d\n"
+ "zip2 v26.2d, v21.2d, v26.2d\n"
+ "zip1 v21.2d, v22.2d, v27.2d\n"
+ "zip2 v27.2d, v22.2d, v27.2d\n"
+ "zip1 v22.2d, v23.2d, v28.2d\n"
+ "zip2 v28.2d, v23.2d, v28.2d\n"
+ "zip1 v23.2d, v24.2d, v29.2d\n"
+ "zip2 v29.2d, v24.2d, v29.2d\n"
+ "zip1 v24.2d, v25.2d, v30.2d\n"
+ "zip2 v30.2d, v25.2d, v30.2d\n"
+ "zip1 v25.2d, v4.2d, v31.2d\n"
+ "zip2 v31.2d, v4.2d, v31.2d\n"
+ "b 105f\n"
+ "104:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "105:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "106:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 107f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 108f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 108f\n"
+ "107:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "108:" // Height 3: input setup done
+ "cmp x24, #0x4\n"
+ "blt 111f\n"
+ "ld1 { v0.4s }, [x23], #0x10\n"
+ "cmp x24, #0x8\n"
+ "blt 110f\n"
+ "109:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x22], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x21], #0x10\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x23], #0x10\n"
+ ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ "bge 109b\n"
+ "110:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x22], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x21], #0x10\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ "111:" // Height 3: Multiply loop: Main loop skip
+ "cbz x24, 114f\n"
+ "cbz x24, 114f\n"
+ "tbz x24, #1, 112f\n"
+ "ldr d0, [x23], #0x8\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d2, [x21], #0x8\n"
+ "tbz x24, #0, 113f\n"
+ "ld1 { v0.s }[2], [x23]\n"
+ "ld1 { v1.s }[2], [x22]\n"
+ "ld1 { v2.s }[2], [x21]\n"
+ "b 113f\n"
+ "112:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x23, #0x0]\n"
+ "ldr s1, [x22, #0x0]\n"
+ "ldr s2, [x21, #0x0]\n"
+ "113:" // Height 3: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ "114:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 106b\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "uzp1 v20.2d, v20.2d, v26.2d\n"
+ "uzp1 v21.2d, v21.2d, v27.2d\n"
+ "uzp1 v22.2d, v22.2d, v28.2d\n"
+ "uzp1 v23.2d, v23.2d, v29.2d\n"
+ "uzp1 v24.2d, v24.2d, v30.2d\n"
+ "uzp1 v25.2d, v25.2d, v31.2d\n"
+ "tbz %x[flags], #1, 115f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v4.4s, v4.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmax v4.4s, v4.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "115:" // Height 3: No activation
+ "cmp x9, #0x18\n"
+ "bge 128f\n"
+ "tbz x9, #4, 119f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x26], #0x10\n"
+ "st1 { v16.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "st1 { v9.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x22], #0x10\n"
+ "st1 { v11.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 117f\n"
+ "st1 { v17.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 116f\n"
+ "str d18, [x26], #0x8\n"
+ "str d13, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v18.s }[2], [x26]\n"
+ "st1 { v13.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 127f\n"
+ "116:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 127f\n"
+ "str s18, [x26, #0x0]\n"
+ "str s13, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 127f\n"
+ "117:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 118f\n"
+ "str d17, [x26], #0x8\n"
+ "str d12, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "st1 { v12.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 127f\n"
+ "118:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 127f\n"
+ "str s17, [x26, #0x0]\n"
+ "str s12, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "b 127f\n"
+ "119:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 123f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "st1 { v9.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "st1 { v21.4s }, [x21], #0x10\n"
+ "tbz x9, #2, 121f\n"
+ "st1 { v15.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x22], #0x10\n"
+ "st1 { v22.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 120f\n"
+ "str d16, [x26], #0x8\n"
+ "str d11, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v16.s }[2], [x26]\n"
+ "st1 { v11.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "b 127f\n"
+ "120:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 127f\n"
+ "str s16, [x26, #0x0]\n"
+ "str s11, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "b 127f\n"
+ "121:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 122f\n"
+ "str d15, [x26], #0x8\n"
+ "str d10, [x22], #0x8\n"
+ "str d22, [x21], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v15.s }[2], [x26]\n"
+ "st1 { v10.s }[2], [x22]\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "b 127f\n"
+ "122:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 127f\n"
+ "str s15, [x26, #0x0]\n"
+ "str s10, [x22, #0x0]\n"
+ "str s22, [x21, #0x0]\n"
+ "b 127f\n"
+ "123:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 125f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x21], #0x10\n"
+ "tbz x9, #1, 124f\n"
+ "str d14, [x26], #0x8\n"
+ "str d9, [x22], #0x8\n"
+ "str d21, [x21], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v14.s }[2], [x26]\n"
+ "st1 { v9.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "b 127f\n"
+ "124:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 127f\n"
+ "str s14, [x26, #0x0]\n"
+ "str s9, [x22, #0x0]\n"
+ "str s21, [x21, #0x0]\n"
+ "b 127f\n"
+ "125:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 126f\n"
+ "str d4, [x26], #0x8\n"
+ "str d8, [x22], #0x8\n"
+ "str d20, [x21], #0x8\n"
+ "tbz x9, #0, 127f\n"
+ "st1 { v4.s }[2], [x26]\n"
+ "st1 { v8.s }[2], [x22]\n"
+ "st1 { v20.s }[2], [x21]\n"
+ "b 127f\n"
+ "126:" // Height 3: Partial direct writeback: partial_1_0
+ "str s4, [x26, #0x0]\n"
+ "str s8, [x22, #0x0]\n"
+ "str s20, [x21, #0x0]\n"
+ "127:" // Height 3: Partial direct writeback: Done
+ "b 129f\n"
+ "128:" // Height 3: Full writeback
+ "str q4, [x26, #0x0]\n"
+ "str q14, [x26, #0x10]\n"
+ "str q15, [x26, #0x20]\n"
+ "str q16, [x26, #0x30]\n"
+ "str q17, [x26, #0x40]\n"
+ "str q18, [x26, #0x50]\n"
+ "add x26, x26, #0x60\n"
+ "str q8, [x22, #0x0]\n"
+ "str q9, [x22, #0x10]\n"
+ "str q10, [x22, #0x20]\n"
+ "str q11, [x22, #0x30]\n"
+ "str q12, [x22, #0x40]\n"
+ "str q13, [x22, #0x50]\n"
+ "str q20, [x21, #0x0]\n"
+ "str q21, [x21, #0x10]\n"
+ "str q22, [x21, #0x20]\n"
+ "str q23, [x21, #0x30]\n"
+ "str q24, [x21, #0x40]\n"
+ "str q25, [x21, #0x50]\n"
+ "129:" // Height 3: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 88b\n"
+ "b 174f\n"
+ "130:" // Height 4
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x10\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "131:" // Height 4: Column loop
+ "cbz x27, 132f\n"
+ "ldr q8, [x27, #0x0]\n"
+ "zip2 v14.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x27, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x27, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x27, #0x30]\n"
+ "mov v26.16b, v14.16b\n"
+ "ldr q12, [x27, #0x40]\n"
+ "ldr q13, [x27, #0x50]\n"
+ "zip2 v15.2d, v9.2d, v9.2d\n"
+ "add x27, x27, #0x60\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v16.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v17.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "zip2 v18.2d, v12.2d, v12.2d\n"
+ "zip1 v12.2d, v12.2d, v12.2d\n"
+ "zip2 v19.2d, v13.2d, v13.2d\n"
+ "zip1 v13.2d, v13.2d, v13.2d\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v27.16b, v15.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v28.16b, v16.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v29.16b, v17.16b\n"
+ "mov v24.16b, v12.16b\n"
+ "mov v30.16b, v18.16b\n"
+ "mov v25.16b, v13.16b\n"
+ "mov v31.16b, v19.16b\n"
+ "b 148f\n"
+ "132:" // Height 4: no bias
+ "tbz %x[flags], #0, 147f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x9, #0x18\n"
+ "add x22, x26, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 145f\n"
+ "tbz x9, #4, 136f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x20], #0x10\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v23.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v12.4s }, [x26], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v24.4s }, [x21], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x9, #2, 134f\n"
+ "ld1 { v13.4s }, [x26], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 133f\n"
+ "mov x19, #0x58\n"
+ "ldr d20, [x26], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d4, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v20.s }[2], [x26]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v4.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 144f\n"
+ "133:" // Height 4: Partial accumulate: partial_1_20
+ "mov x19, #0x50\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s20, [x26, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s4, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "b 144f\n"
+ "134:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x9, #1, 135f\n"
+ "ldr d13, [x26], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "mov x19, #0x48\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v13.s }[2], [x26]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
+ "b 144f\n"
+ "135:" // Height 4: Partial accumulate: partial_1_16
+ "mov x19, #0x40\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s13, [x26, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
+ "b 144f\n"
+ "136:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x9, #3, 140f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x26], #0x10\n"
+ "ld1 { v15.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x20], #0x10\n"
+ "tbz x9, #2, 138f\n"
+ "ld1 { v11.4s }, [x26], #0x10\n"
+ "ld1 { v16.4s }, [x22], #0x10\n"
+ "ld1 { v23.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 137f\n"
+ "mov x19, #0x38\n"
+ "ldr d12, [x26], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d24, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v12.s }[2], [x26]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v24.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
+ "b 144f\n"
+ "137:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s12, [x26, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s24, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
+ "b 144f\n"
+ "138:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x9, #1, 139f\n"
+ "ldr d11, [x26], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v11.s }[2], [x26]\n"
+ "ld1 { v16.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 144f\n"
+ "139:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s11, [x26, #0x0]\n"
+ "ldr s16, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
+ "b 144f\n"
+ "140:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x9, #2, 142f\n"
+ "ld1 { v9.4s }, [x26], #0x10\n"
+ "ld1 { v14.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 141f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d15, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v15.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "b 144f\n"
+ "141:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x9, #0, 144f\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s15, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "b 144f\n"
+ "142:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x9, #1, 143f\n"
+ "ldr d9, [x26], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "tbz x9, #0, 144f\n"
+ "ld1 { v9.s }[2], [x26]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "b 144f\n"
+ "143:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x26, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s14, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "144:" // Height 4: Partial accumulate: Done
+ "sub x26, x26, x19\n"
+ "b 146f\n"
+ "145:" // Height 4: full accumulate
+ "ldr q9, [x26, #0x0]\n"
+ "ldr q10, [x26, #0x10]\n"
+ "ldr q11, [x26, #0x20]\n"
+ "ldr q12, [x26, #0x30]\n"
+ "ldr q13, [x26, #0x40]\n"
+ "ldr q20, [x26, #0x50]\n"
+ "ldr q14, [x22, #0x0]\n"
+ "ldr q15, [x22, #0x10]\n"
+ "ldr q16, [x22, #0x20]\n"
+ "ldr q17, [x22, #0x30]\n"
+ "ldr q18, [x22, #0x40]\n"
+ "ldr q19, [x22, #0x50]\n"
+ "ldr q21, [x21, #0x0]\n"
+ "ldr q22, [x21, #0x10]\n"
+ "ldr q23, [x21, #0x20]\n"
+ "ldr q24, [x21, #0x30]\n"
+ "ldr q25, [x21, #0x40]\n"
+ "ldr q4, [x21, #0x50]\n"
+ "ldr q26, [x20, #0x0]\n"
+ "ldr q27, [x20, #0x10]\n"
+ "ldr q28, [x20, #0x20]\n"
+ "ldr q29, [x20, #0x30]\n"
+ "ldr q30, [x20, #0x40]\n"
+ "ldr q31, [x20, #0x50]\n"
+ "146:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v14.2d\n"
+ "zip2 v14.2d, v9.2d, v14.2d\n"
+ "zip1 v9.2d, v10.2d, v15.2d\n"
+ "zip2 v15.2d, v10.2d, v15.2d\n"
+ "zip1 v10.2d, v11.2d, v16.2d\n"
+ "zip2 v16.2d, v11.2d, v16.2d\n"
+ "zip1 v11.2d, v12.2d, v17.2d\n"
+ "zip2 v17.2d, v12.2d, v17.2d\n"
+ "zip1 v12.2d, v13.2d, v18.2d\n"
+ "zip2 v18.2d, v13.2d, v18.2d\n"
+ "zip1 v13.2d, v20.2d, v19.2d\n"
+ "zip2 v19.2d, v20.2d, v19.2d\n"
+ "zip1 v20.2d, v21.2d, v26.2d\n"
+ "zip2 v26.2d, v21.2d, v26.2d\n"
+ "zip1 v21.2d, v22.2d, v27.2d\n"
+ "zip2 v27.2d, v22.2d, v27.2d\n"
+ "zip1 v22.2d, v23.2d, v28.2d\n"
+ "zip2 v28.2d, v23.2d, v28.2d\n"
+ "zip1 v23.2d, v24.2d, v29.2d\n"
+ "zip2 v29.2d, v24.2d, v29.2d\n"
+ "zip1 v24.2d, v25.2d, v30.2d\n"
+ "zip2 v30.2d, v25.2d, v30.2d\n"
+ "zip1 v25.2d, v4.2d, v31.2d\n"
+ "zip2 v31.2d, v4.2d, v31.2d\n"
+ "b 148f\n"
+ "147:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "148:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "149:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 150f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x25, 151f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 151f\n"
+ "150:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "151:" // Height 4: input setup done
+ "cmp x24, #0x4\n"
+ "blt 154f\n"
+ "ld1 { v0.4s }, [x23], #0x10\n"
+ "cmp x24, #0x8\n"
+ "blt 153f\n"
+ "152:" // Height 4: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x22], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x21], #0x10\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ld1 { v3.4s }, [x20], #0x10\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x23], #0x10\n"
+ ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ "bge 152b\n"
+ "153:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x22], #0x10\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x21], #0x10\n"
+ "ld1 { v3.4s }, [x20], #0x10\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ "ldr q5, [x28, #0x10]\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ldr q6, [x28, #0x20]\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ "154:" // Height 4: Multiply loop: Main loop skip
+ "cbz x24, 157f\n"
+ "cbz x24, 157f\n"
+ "tbz x24, #1, 155f\n"
+ "ldr d0, [x23], #0x8\n"
+ "ldr d1, [x22], #0x8\n"
+ "ldr d2, [x21], #0x8\n"
+ "ldr d3, [x20], #0x8\n"
+ "tbz x24, #0, 156f\n"
+ "ld1 { v0.s }[2], [x23]\n"
+ "ld1 { v1.s }[2], [x22]\n"
+ "ld1 { v2.s }[2], [x21]\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 156f\n"
+ "155:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x23, #0x0]\n"
+ "ldr s1, [x22, #0x0]\n"
+ "ldr s2, [x21, #0x0]\n"
+ "ldr s3, [x20, #0x0]\n"
+ "156:" // Height 4: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q4, [x28, #0x0]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q5, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x40]\n"
+ ".inst 0x6e45ec0e // bfmmla v14.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5a // bfmmla v26.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x50]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5b // bfmmla v27.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0x70]\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ "ldr q4, [x28, #0x80]\n"
+ ".inst 0x6e45ec10 // bfmmla v16.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5c // bfmmla v28.4s, v2.8h, v5.8h\n"
+ "ldr q5, [x28, #0x90]\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec57 // bfmmla v23.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x28, #0xa0]\n"
+ ".inst 0x6e47ec11 // bfmmla v17.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5d // bfmmla v29.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x28, #0xb0]\n"
+ "add x28, x28, #0xc0\n"
+ ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec12 // bfmmla v18.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec5e // bfmmla v30.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e46ec0d // bfmmla v13.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec59 // bfmmla v25.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec13 // bfmmla v19.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec5f // bfmmla v31.4s, v2.8h, v7.8h\n"
+ "157:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 149b\n"
+ "uzp1 v4.2d, v8.2d, v14.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v14.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "uzp1 v14.2d, v9.2d, v15.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v15.2d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v15.2d, v10.2d, v16.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v16.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp1 v16.2d, v11.2d, v17.2d\n"
+ "uzp2 v11.2d, v11.2d, v17.2d\n"
+ "uzp1 v17.2d, v12.2d, v18.2d\n"
+ "uzp2 v12.2d, v12.2d, v18.2d\n"
+ "uzp1 v18.2d, v13.2d, v19.2d\n"
+ "uzp2 v13.2d, v13.2d, v19.2d\n"
+ "uzp1 v19.2d, v20.2d, v26.2d\n"
+ "uzp2 v20.2d, v20.2d, v26.2d\n"
+ "uzp1 v26.2d, v21.2d, v27.2d\n"
+ "uzp2 v21.2d, v21.2d, v27.2d\n"
+ "uzp1 v27.2d, v22.2d, v28.2d\n"
+ "uzp2 v22.2d, v22.2d, v28.2d\n"
+ "uzp1 v28.2d, v23.2d, v29.2d\n"
+ "uzp2 v23.2d, v23.2d, v29.2d\n"
+ "uzp1 v29.2d, v24.2d, v30.2d\n"
+ "uzp2 v24.2d, v24.2d, v30.2d\n"
+ "uzp1 v30.2d, v25.2d, v31.2d\n"
+ "uzp2 v25.2d, v25.2d, v31.2d\n"
+ "tbz %x[flags], #1, 158f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v4.4s, v4.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmax v4.4s, v4.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "158:" // Height 4: No activation
+ "cmp x9, #0x18\n"
+ "bge 171f\n"
+ "tbz x9, #4, 162f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v15.4s }, [x26], #0x10\n"
+ "st1 { v16.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "st1 { v9.4s }, [x22], #0x10\n"
+ "st1 { v10.4s }, [x22], #0x10\n"
+ "st1 { v11.4s }, [x22], #0x10\n"
+ "st1 { v19.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v20.4s }, [x20], #0x10\n"
+ "st1 { v21.4s }, [x20], #0x10\n"
+ "st1 { v22.4s }, [x20], #0x10\n"
+ "st1 { v23.4s }, [x20], #0x10\n"
+ "tbz x9, #2, 160f\n"
+ "st1 { v17.4s }, [x26], #0x10\n"
+ "st1 { v12.4s }, [x22], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 159f\n"
+ "str d18, [x26], #0x8\n"
+ "str d13, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v18.s }[2], [x26]\n"
+ "st1 { v13.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 170f\n"
+ "159:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x9, #0, 170f\n"
+ "str s18, [x26, #0x0]\n"
+ "str s13, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 170f\n"
+ "160:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x9, #1, 161f\n"
+ "str d17, [x26], #0x8\n"
+ "str d12, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "st1 { v12.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 170f\n"
+ "161:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x9, #0, 170f\n"
+ "str s17, [x26, #0x0]\n"
+ "str s12, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "b 170f\n"
+ "162:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x9, #3, 166f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v14.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "st1 { v9.4s }, [x22], #0x10\n"
+ "st1 { v19.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "st1 { v20.4s }, [x20], #0x10\n"
+ "st1 { v21.4s }, [x20], #0x10\n"
+ "tbz x9, #2, 164f\n"
+ "st1 { v15.4s }, [x26], #0x10\n"
+ "st1 { v10.4s }, [x22], #0x10\n"
+ "st1 { v27.4s }, [x21], #0x10\n"
+ "st1 { v22.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 163f\n"
+ "str d16, [x26], #0x8\n"
+ "str d11, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "str d23, [x20], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v16.s }[2], [x26]\n"
+ "st1 { v11.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "b 170f\n"
+ "163:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 170f\n"
+ "str s16, [x26, #0x0]\n"
+ "str s11, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "str s23, [x20, #0x0]\n"
+ "b 170f\n"
+ "164:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 165f\n"
+ "str d15, [x26], #0x8\n"
+ "str d10, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "str d22, [x20], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v15.s }[2], [x26]\n"
+ "st1 { v10.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v22.s }[2], [x20]\n"
+ "b 170f\n"
+ "165:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 170f\n"
+ "str s15, [x26, #0x0]\n"
+ "str s10, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "str s22, [x20, #0x0]\n"
+ "b 170f\n"
+ "166:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 168f\n"
+ "st1 { v4.4s }, [x26], #0x10\n"
+ "st1 { v8.4s }, [x22], #0x10\n"
+ "st1 { v19.4s }, [x21], #0x10\n"
+ "st1 { v20.4s }, [x20], #0x10\n"
+ "tbz x9, #1, 167f\n"
+ "str d14, [x26], #0x8\n"
+ "str d9, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "str d21, [x20], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v14.s }[2], [x26]\n"
+ "st1 { v9.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "st1 { v21.s }[2], [x20]\n"
+ "b 170f\n"
+ "167:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 170f\n"
+ "str s14, [x26, #0x0]\n"
+ "str s9, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "str s21, [x20, #0x0]\n"
+ "b 170f\n"
+ "168:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 169f\n"
+ "str d4, [x26], #0x8\n"
+ "str d8, [x22], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "str d20, [x20], #0x8\n"
+ "tbz x9, #0, 170f\n"
+ "st1 { v4.s }[2], [x26]\n"
+ "st1 { v8.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x21]\n"
+ "st1 { v20.s }[2], [x20]\n"
+ "b 170f\n"
+ "169:" // Height 4: Partial direct writeback: partial_1_0
+ "str s4, [x26, #0x0]\n"
+ "str s8, [x22, #0x0]\n"
+ "str s19, [x21, #0x0]\n"
+ "str s20, [x20, #0x0]\n"
+ "170:" // Height 4: Partial direct writeback: Done
+ "b 172f\n"
+ "171:" // Height 4: Full writeback
+ "str q4, [x26, #0x0]\n"
+ "str q14, [x26, #0x10]\n"
+ "str q15, [x26, #0x20]\n"
+ "str q16, [x26, #0x30]\n"
+ "str q17, [x26, #0x40]\n"
+ "str q18, [x26, #0x50]\n"
+ "add x26, x26, #0x60\n"
+ "str q8, [x22, #0x0]\n"
+ "str q9, [x22, #0x10]\n"
+ "str q10, [x22, #0x20]\n"
+ "str q11, [x22, #0x30]\n"
+ "str q12, [x22, #0x40]\n"
+ "str q13, [x22, #0x50]\n"
+ "str q19, [x21, #0x0]\n"
+ "str q26, [x21, #0x10]\n"
+ "str q27, [x21, #0x20]\n"
+ "str q28, [x21, #0x30]\n"
+ "str q29, [x21, #0x40]\n"
+ "str q30, [x21, #0x50]\n"
+ "str q20, [x20, #0x0]\n"
+ "str q21, [x20, #0x10]\n"
+ "str q22, [x20, #0x20]\n"
+ "str q23, [x20, #0x30]\n"
+ "str q24, [x20, #0x40]\n"
+ "str q25, [x20, #0x50]\n"
+ "172:" // Height 4: Writeback done
+ "subs x9, x9, #0x18\n"
+ "bgt 131b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 174f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 173f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "173:" // Update direct input
+ "mov x19, #0x10\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "174:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
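
The 24-wide kernel above and the 6x16 variant added below share one numeric scheme: fp32 rows of A are narrowed to bfloat16 in flight (the BFCVTN/BFCVTN2 pairs), B is pre-packed as bfloat16, and each BFMMLA accumulates a 2x2 fp32 block from a 2x4-by-4x2 bf16 product. A minimal scalar sketch of that arithmetic, assuming round-to-nearest-even conversion and ignoring NaN/denormal corner cases (the hardware instruction has its own intermediate rounding and special-value rules, which this model does not reproduce exactly):

#include <cstdint>
#include <cstring>

// fp32 -> bf16 -> fp32 round trip: keep the top 16 bits (sign, exponent,
// 7 mantissa bits), rounding to nearest with ties to even. NaN handling
// is deliberately omitted in this sketch.
static float round_fp32_to_bf16(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits += 0x7fffu + ((bits >> 16) & 1u); // round to nearest, ties to even
    bits &= 0xffff0000u;                   // truncate to bf16 precision
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}

// One BFMMLA step: C (2x2, fp32) += A (2x4, bf16) * transpose(B) (4x2, bf16),
// with products and sums carried in fp32. In the kernel, B is already stored
// as bf16; it is converted here only so the model can take fp32 throughout.
static void bfmmla_ref(float C[2][2], const float A[2][4], const float B[2][4]) {
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
            for (int k = 0; k < 4; ++k)
                C[i][j] += round_fp32_to_bf16(A[i][k]) * round_fp32_to_bf16(B[j][k]);
}

This is only a reference model for the fast_mode numerics; the kernels themselves keep the 2x2 accumulator blocks interleaved in registers, which is why the epilogues above spend so many uzp1/uzp2 instructions de-interleaving them before writeback.
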
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
new file mode 100644
index 0000000000..f5e9009f6d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16.hpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_fp32bf16fp32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_fp32bf16fp32_mmla_6x16
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 16.37 };
+ case CPUModel::A510:
+ return { 6.70 };
+ case CPUModel::V1:
+ return { 26.64 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp32bf16fp32_mmla_6x16;
+ cls_a64_hybrid_fp32bf16fp32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
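
The blocking parameters above can be cross-checked against the generated assembly that follows: with out_width() = 16 and k_unroll() = 4, one main-loop iteration consumes a 16-column-by-4-deep bf16 panel of B, i.e. 16 * 4 * 2 = 128 bytes, matching the "add x10, x10, #0x80" in the loop bodies (the 24-wide kernel earlier advances by 0xc0 = 24 * 4 * 2 for the same reason). A small sketch of that bookkeeping, where b_step_bytes is a hypothetical helper, not part of the arm_gemm API:

#include <cstddef>

// Bytes of packed B consumed per k_unroll step: out_width() bf16 values
// for each of k_unroll() depth positions.
constexpr std::size_t b_step_bytes(std::size_t out_width, std::size_t k_unroll) {
    return out_width * k_unroll * 2; // 2 == sizeof(bfloat16)
}

static_assert(b_step_bytes(16, 4) == 0x80, "6x16 kernel: add x10, x10, #0x80");
static_assert(b_step_bytes(24, 4) == 0xc0, "24-wide kernel: add x28, x28, #0xc0");
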
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..19dbf0588e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp
@@ -0,0 +1,3137 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32bf16fp32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
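+    // NOTE (annotation, not part of the upstream patch): the flag bits
+    // assembled above, as consumed by the assembly below:
+    //   bit 0 (0x1): accumulate into the existing output (tbz %x[flags], #0)
+    //   bit 1 (0x2): clamp with minval/maxval (tbz %x[flags], #1)
+    //   bit 2 (0x4): output is indirect
+    //   bit 3 (0x8): input is indirect, per-string row pointers (tbz %x[flags], #3)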
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 176f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 141f\n"
+ "beq 106f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 71f\n"
+ "beq 36f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "cbz x9, 3f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "add x9, x9, #0x40\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 15f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x11, #0x10\n"
+ "bge 12f\n"
+ "tbz x11, #3, 7f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "tbz x11, #2, 5f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 4f\n"
+ "mov x19, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x11, #1, 6f\n"
+ "ldr d11, [x28], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x11, #2, 9f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 8f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x19, #0x18\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 11f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x11, #1, 10f\n"
+ "ldr d9, [x28], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x11, #0, 11f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 13f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "13:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 18f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x26, #0x4\n"
+ "blt 21f\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ "cmp x26, #0x8\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q7, [x10, #0x10]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 24f\n"
+ "cbz x26, 24f\n"
+ "tbz x26, #1, 22f\n"
+ "ldr d0, [x25], #0x8\n"
+ "tbz x26, #0, 23f\n"
+ "ld1 { v0.s }[2], [x25]\n"
+ "b 23f\n"
+ "22:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x25, #0x0]\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "24:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 16b\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "25:" // Height 1: No activation
+ "cmp x11, #0x10\n"
+ "bge 34f\n"
+ "tbz x11, #3, 29f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "tbz x11, #2, 27f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 26f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "b 33f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 33f\n"
+ "str s11, [x28, #0x0]\n"
+ "b 33f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 28f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "b 33f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 33f\n"
+ "str s10, [x28, #0x0]\n"
+ "b 33f\n"
+ "29:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 31f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "tbz x11, #1, 30f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "b 33f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 33f\n"
+ "str s9, [x28, #0x0]\n"
+ "b 33f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 32f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x11, #0, 33f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "b 33f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "33:" // Height 1: Partial direct writeback: Done
+ "b 35f\n"
+ "34:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "35:" // Height 1: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 2b\n"
+ "b 212f\n"
+ "36:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "37:" // Height 2: Column loop
+ "cbz x9, 38f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "ldr q11, [x9, #0x30]\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "add x9, x9, #0x40\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "b 50f\n"
+ "38:" // Height 2: no bias
+ "tbz %x[flags], #0, 49f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "bge 47f\n"
+ "tbz x11, #3, 42f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 40f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 39f\n"
+ "mov x19, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x11, #1, 41f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x11, #2, 44f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 43f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 46f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x11, #1, 45f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x11, #0, 46f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 48f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "48:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 50f\n"
+ "49:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "50:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "51:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 52f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 53f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 53f\n"
+ "52:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "53:" // Height 2: input setup done
+ "cmp x26, #0x4\n"
+ "blt 56f\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ "cmp x26, #0x8\n"
+ "blt 55f\n"
+ "54:" // Height 2: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q7, [x10, #0x10]\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ "bge 54b\n"
+ "55:" // Height 2: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "56:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 59f\n"
+ "cbz x26, 59f\n"
+ "tbz x26, #1, 57f\n"
+ "ldr d0, [x25], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "tbz x26, #0, 58f\n"
+ "ld1 { v0.s }[2], [x25]\n"
+ "ld1 { v1.s }[2], [x24]\n"
+ "b 58f\n"
+ "57:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x25, #0x0]\n"
+ "ldr s1, [x24, #0x0]\n"
+ "58:" // Height 2: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "59:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 51b\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "tbz %x[flags], #1, 60f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v6.4s, v6.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v6.4s, v6.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "60:" // Height 2: No activation
+ "cmp x11, #0x10\n"
+ "bge 69f\n"
+ "tbz x11, #3, 64f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "tbz x11, #2, 62f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 61f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 68f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 63f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 68f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 66f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "tbz x11, #1, 65f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 68f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 67f\n"
+ "str d6, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "tbz x11, #0, 68f\n"
+ "st1 { v6.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_0
+ "str s6, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "68:" // Height 2: Partial direct writeback: Done
+ "b 70f\n"
+ "69:" // Height 2: Full writeback
+ "str q6, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "70:" // Height 2: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 37b\n"
+ "b 212f\n"
+ "71:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "72:" // Height 3: Column loop
+ "cbz x9, 73f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "mov v20.16b, v12.16b\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 85f\n"
+ "73:" // Height 3: no bias
+ "tbz %x[flags], #0, 84f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "bge 82f\n"
+ "tbz x11, #3, 77f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 75f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 74f\n"
+ "mov x19, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 81f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "b 81f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x11, #1, 76f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "b 81f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "b 81f\n"
+ "77:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x11, #2, 79f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 78f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "b 81f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 81f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "b 81f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x11, #1, 80f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "tbz x11, #0, 81f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "b 81f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "81:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 83f\n"
+ "82:" // Height 3: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "83:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 85f\n"
+ "84:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "85:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "86:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 87f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 88f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 88f\n"
+ "87:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "88:" // Height 3: input setup done
+ "cmp x26, #0x4\n"
+ "blt 91f\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ "cmp x26, #0x8\n"
+ "blt 90f\n"
+ "89:" // Height 3: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x23], #0x10\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "bge 89b\n"
+ "90:" // Height 3: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x23], #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "91:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 94f\n"
+ "cbz x26, 94f\n"
+ "tbz x26, #1, 92f\n"
+ "ldr d0, [x25], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d2, [x23], #0x8\n"
+ "tbz x26, #0, 93f\n"
+ "ld1 { v0.s }[2], [x25]\n"
+ "ld1 { v1.s }[2], [x24]\n"
+ "ld1 { v2.s }[2], [x23]\n"
+ "b 93f\n"
+ "92:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x25, #0x0]\n"
+ "ldr s1, [x24, #0x0]\n"
+ "ldr s2, [x23, #0x0]\n"
+ "93:" // Height 3: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "94:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 86b\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 95f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v6.4s, v6.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v6.4s, v6.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "95:" // Height 3: No activation
+ "cmp x11, #0x10\n"
+ "bge 104f\n"
+ "tbz x11, #3, 99f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "st1 { v17.4s }, [x23], #0x10\n"
+ "tbz x11, #2, 97f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v18.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 96f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d19, [x23], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "b 103f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 103f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s19, [x23, #0x0]\n"
+ "b 103f\n"
+ "97:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 98f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d18, [x23], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v18.s }[2], [x23]\n"
+ "b 103f\n"
+ "98:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 103f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s18, [x23, #0x0]\n"
+ "b 103f\n"
+ "99:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 101f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v16.4s }, [x23], #0x10\n"
+ "tbz x11, #1, 100f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d17, [x23], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v17.s }[2], [x23]\n"
+ "b 103f\n"
+ "100:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 103f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s17, [x23, #0x0]\n"
+ "b 103f\n"
+ "101:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 102f\n"
+ "str d6, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d16, [x23], #0x8\n"
+ "tbz x11, #0, 103f\n"
+ "st1 { v6.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "b 103f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_0
+ "str s6, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s16, [x23, #0x0]\n"
+ "103:" // Height 3: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 3: Full writeback
+ "str q6, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q16, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q18, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "105:" // Height 3: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 72b\n"
+ "b 212f\n"
+ "106:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "107:" // Height 4: Column loop
+ "cbz x9, 108f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "mov v20.16b, v12.16b\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "b 120f\n"
+ "108:" // Height 4: no bias
+ "tbz %x[flags], #0, 119f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 117f\n"
+ "tbz x11, #3, 112f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 110f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 109f\n"
+ "mov x19, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "b 116f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "b 116f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x11, #1, 111f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "b 116f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "b 116f\n"
+ "112:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x11, #2, 114f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 113f\n"
+ "mov x19, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "b 116f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 116f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "b 116f\n"
+ "114:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x11, #1, 115f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "tbz x11, #0, 116f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "b 116f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "116:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 118f\n"
+ "117:" // Height 4: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "118:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 120f\n"
+ "119:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "120:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "121:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 122f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 123f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 123f\n"
+ "122:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "123:" // Height 4: input setup done
+ "cmp x26, #0x4\n"
+ "blt 126f\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ "cmp x26, #0x8\n"
+ "blt 125f\n"
+ "124:" // Height 4: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x23], #0x10\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ld1 { v3.4s }, [x22], #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "bge 124b\n"
+ "125:" // Height 4: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x23], #0x10\n"
+ "ld1 { v3.4s }, [x22], #0x10\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "126:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 129f\n"
+ "cbz x26, 129f\n"
+ "tbz x26, #1, 127f\n"
+ "ldr d0, [x25], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d2, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "tbz x26, #0, 128f\n"
+ "ld1 { v0.s }[2], [x25]\n"
+ "ld1 { v1.s }[2], [x24]\n"
+ "ld1 { v2.s }[2], [x23]\n"
+ "ld1 { v3.s }[2], [x22]\n"
+ "b 128f\n"
+ "127:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x25, #0x0]\n"
+ "ldr s1, [x24, #0x0]\n"
+ "ldr s2, [x23, #0x0]\n"
+ "ldr s3, [x22, #0x0]\n"
+ "128:" // Height 4: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "129:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 121b\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "tbz %x[flags], #1, 130f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v6.4s, v6.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v6.4s, v6.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "130:" // Height 4: No activation
+ "cmp x11, #0x10\n"
+ "bge 139f\n"
+ "tbz x11, #3, 134f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x11, #2, 132f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 131f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 138f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 133f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 138f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 136f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x11, #1, 135f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 138f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 137f\n"
+ "str d6, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x11, #0, 138f\n"
+ "st1 { v6.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial direct writeback: partial_1_0
+ "str s6, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "138:" // Height 4: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 4: Full writeback
+ "str q6, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "140:" // Height 4: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 107b\n"
+ "b 212f\n"
+ "141:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "142:" // Height 5: Column loop
+ "cbz x9, 143f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 155f\n"
+ "143:" // Height 5: no bias
+ "tbz %x[flags], #0, 154f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 152f\n"
+ "tbz x11, #3, 147f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 145f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 144f\n"
+ "ldr d16, [x28], #0x8\n"
+ "mov x19, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d6, [x21], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v6.s }[2], [x21]\n"
+ "b 151f\n"
+ "144:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s6, [x21, #0x0]\n"
+ "b 151f\n"
+ "145:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x11, #1, 146f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "b 151f\n"
+ "146:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "b 151f\n"
+ "147:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x11, #2, 149f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 148f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "b 151f\n"
+ "148:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 151f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "b 151f\n"
+ "149:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x11, #1, 150f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "tbz x11, #0, 151f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "b 151f\n"
+ "150:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "151:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 153f\n"
+ "152:" // Height 5: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q25, [x21, #0x0]\n"
+ "ldr q26, [x21, #0x10]\n"
+ "ldr q27, [x21, #0x20]\n"
+ "ldr q6, [x21, #0x30]\n"
+ "153:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 155f\n"
+ "154:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "155:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "156:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 157f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 158f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 158f\n"
+ "157:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "158:" // Height 5: input setup done
+ "cmp x26, #0x4\n"
+ "blt 161f\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ "cmp x26, #0x8\n"
+ "blt 160f\n"
+ "159:" // Height 5: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x23], #0x10\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ld1 { v3.4s }, [x22], #0x10\n"
+ "ld1 { v4.4s }, [x21], #0x10\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "bge 159b\n"
+ "160:" // Height 5: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x23], #0x10\n"
+ "ld1 { v3.4s }, [x22], #0x10\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ld1 { v4.4s }, [x21], #0x10\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ldr q7, [x10, #0x10]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "161:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 164f\n"
+ "cbz x26, 164f\n"
+ "tbz x26, #1, 162f\n"
+ "ldr d0, [x25], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d2, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d4, [x21], #0x8\n"
+ "tbz x26, #0, 163f\n"
+ "ld1 { v0.s }[2], [x25]\n"
+ "ld1 { v1.s }[2], [x24]\n"
+ "ld1 { v2.s }[2], [x23]\n"
+ "ld1 { v3.s }[2], [x22]\n"
+ "ld1 { v4.s }[2], [x21]\n"
+ "b 163f\n"
+ "162:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x25, #0x0]\n"
+ "ldr s1, [x24, #0x0]\n"
+ "ldr s2, [x23, #0x0]\n"
+ "ldr s3, [x22, #0x0]\n"
+ "ldr s4, [x21, #0x0]\n"
+ "163:" // Height 5: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "164:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 156b\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 165f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v6.4s, v6.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v6.4s, v6.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "165:" // Height 5: No activation
+ "cmp x11, #0x10\n"
+ "bge 174f\n"
+ "tbz x11, #3, 169f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "st1 { v25.4s }, [x21], #0x10\n"
+ "tbz x11, #2, 167f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v26.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 166f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d27, [x21], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "b 173f\n"
+ "166:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 173f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s27, [x21, #0x0]\n"
+ "b 173f\n"
+ "167:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 168f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d26, [x21], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "b 173f\n"
+ "168:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 173f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s26, [x21, #0x0]\n"
+ "b 173f\n"
+ "169:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 171f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v24.4s }, [x21], #0x10\n"
+ "tbz x11, #1, 170f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d25, [x21], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v25.s }[2], [x21]\n"
+ "b 173f\n"
+ "170:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 173f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s25, [x21, #0x0]\n"
+ "b 173f\n"
+ "171:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 172f\n"
+ "str d6, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d24, [x21], #0x8\n"
+ "tbz x11, #0, 173f\n"
+ "st1 { v6.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v24.s }[2], [x21]\n"
+ "b 173f\n"
+ "172:" // Height 5: Partial direct writeback: partial_1_0
+ "str s6, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s24, [x21, #0x0]\n"
+ "173:" // Height 5: Partial direct writeback: Done
+ "b 175f\n"
+ "174:" // Height 5: Full writeback
+ "str q6, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q24, [x21, #0x0]\n"
+ "str q25, [x21, #0x10]\n"
+ "str q26, [x21, #0x20]\n"
+ "str q27, [x21, #0x30]\n"
+ "175:" // Height 5: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 142b\n"
+ "b 212f\n"
+ "176:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "177:" // Height 6: Column loop
+ "cbz x9, 178f\n"
+ "ldr q8, [x9, #0x0]\n"
+ "zip2 v12.2d, v8.2d, v8.2d\n"
+ "ldr q9, [x9, #0x10]\n"
+ "zip1 v8.2d, v8.2d, v8.2d\n"
+ "ldr q10, [x9, #0x20]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q11, [x9, #0x30]\n"
+ "add x9, x9, #0x40\n"
+ "mov v20.16b, v12.16b\n"
+ "mov v24.16b, v8.16b\n"
+ "zip2 v13.2d, v9.2d, v9.2d\n"
+ "zip1 v9.2d, v9.2d, v9.2d\n"
+ "zip2 v14.2d, v10.2d, v10.2d\n"
+ "zip1 v10.2d, v10.2d, v10.2d\n"
+ "zip2 v15.2d, v11.2d, v11.2d\n"
+ "zip1 v11.2d, v11.2d, v11.2d\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v21.16b, v13.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v28.16b, v12.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v29.16b, v13.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v30.16b, v14.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v31.16b, v15.16b\n"
+ "b 190f\n"
+ "178:" // Height 6: no bias
+ "tbz %x[flags], #0, 189f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x11, #0x10\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 187f\n"
+ "tbz x11, #3, 182f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x24], #0x10\n"
+ "ld1 { v18.4s }, [x23], #0x10\n"
+ "ld1 { v21.4s }, [x22], #0x10\n"
+ "ld1 { v26.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "ld1 { v29.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 180f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x24], #0x10\n"
+ "ld1 { v19.4s }, [x23], #0x10\n"
+ "ld1 { v22.4s }, [x22], #0x10\n"
+ "ld1 { v27.4s }, [x21], #0x10\n"
+ "ld1 { v30.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 179f\n"
+ "ldr d16, [x28], #0x8\n"
+ "mov x19, #0x38\n"
+ "ldr d15, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d23, [x22], #0x8\n"
+ "ldr d6, [x21], #0x8\n"
+ "ldr d31, [x20], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x24]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v23.s }[2], [x22]\n"
+ "ld1 { v6.s }[2], [x21]\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 186f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x24, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s23, [x22, #0x0]\n"
+ "ldr s6, [x21, #0x0]\n"
+ "ldr s31, [x20, #0x0]\n"
+ "b 186f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x11, #1, 181f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x24], #0x8\n"
+ "mov x19, #0x28\n"
+ "ldr d19, [x23], #0x8\n"
+ "ldr d22, [x22], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "ldr d30, [x20], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x24]\n"
+ "ld1 { v19.s }[2], [x23]\n"
+ "ld1 { v22.s }[2], [x22]\n"
+ "ld1 { v27.s }[2], [x21]\n"
+ "ld1 { v30.s }[2], [x20]\n"
+ "b 186f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x24, #0x0]\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s22, [x22, #0x0]\n"
+ "ldr s27, [x21, #0x0]\n"
+ "ldr s30, [x20, #0x0]\n"
+ "b 186f\n"
+ "182:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x11, #2, 184f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x24], #0x10\n"
+ "ld1 { v17.4s }, [x23], #0x10\n"
+ "ld1 { v20.4s }, [x22], #0x10\n"
+ "ld1 { v25.4s }, [x21], #0x10\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 183f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x19, #0x18\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d18, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d29, [x20], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v18.s }[2], [x23]\n"
+ "ld1 { v21.s }[2], [x22]\n"
+ "ld1 { v26.s }[2], [x21]\n"
+ "ld1 { v29.s }[2], [x20]\n"
+ "b 186f\n"
+ "183:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x11, #0, 186f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s18, [x23, #0x0]\n"
+ "ldr s21, [x22, #0x0]\n"
+ "ldr s26, [x21, #0x0]\n"
+ "ldr s29, [x20, #0x0]\n"
+ "b 186f\n"
+ "184:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x11, #1, 185f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "mov x19, #0x8\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d25, [x21], #0x8\n"
+ "ldr d28, [x20], #0x8\n"
+ "tbz x11, #0, 186f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v17.s }[2], [x23]\n"
+ "ld1 { v20.s }[2], [x22]\n"
+ "ld1 { v25.s }[2], [x21]\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 186f\n"
+ "185:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x19, #0x0\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s17, [x23, #0x0]\n"
+ "ldr s20, [x22, #0x0]\n"
+ "ldr s25, [x21, #0x0]\n"
+ "ldr s28, [x20, #0x0]\n"
+ "186:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x19\n"
+ "b 188f\n"
+ "187:" // Height 6: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x24, #0x0]\n"
+ "ldr q13, [x24, #0x10]\n"
+ "ldr q14, [x24, #0x20]\n"
+ "ldr q15, [x24, #0x30]\n"
+ "ldr q17, [x23, #0x0]\n"
+ "ldr q18, [x23, #0x10]\n"
+ "ldr q19, [x23, #0x20]\n"
+ "ldr q24, [x23, #0x30]\n"
+ "ldr q20, [x22, #0x0]\n"
+ "ldr q21, [x22, #0x10]\n"
+ "ldr q22, [x22, #0x20]\n"
+ "ldr q23, [x22, #0x30]\n"
+ "ldr q25, [x21, #0x0]\n"
+ "ldr q26, [x21, #0x10]\n"
+ "ldr q27, [x21, #0x20]\n"
+ "ldr q6, [x21, #0x30]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q29, [x20, #0x10]\n"
+ "ldr q30, [x20, #0x20]\n"
+ "ldr q31, [x20, #0x30]\n"
+ "188:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 190f\n"
+ "189:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "190:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "191:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 192f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 193f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 193f\n"
+ "192:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "193:" // Height 6: input setup done
+ "cmp x26, #0x4\n"
+ "blt 196f\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ "cmp x26, #0x8\n"
+ "blt 195f\n"
+ "194:" // Height 6: Multiply loop: Main loop head
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x23], #0x10\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ld1 { v3.4s }, [x22], #0x10\n"
+ "ld1 { v4.4s }, [x21], #0x10\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ "ld1 { v0.4s }, [x25], #0x10\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "bge 194b\n"
+ "195:" // Height 6: Multiply loop: Single iteration only
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ld1 { v1.4s }, [x24], #0x10\n"
+ "sub x26, x26, #0x4\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ "ld1 { v2.4s }, [x23], #0x10\n"
+ "ld1 { v3.4s }, [x22], #0x10\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ld1 { v4.4s }, [x21], #0x10\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "196:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 199f\n"
+ "cbz x26, 199f\n"
+ "tbz x26, #1, 197f\n"
+ "ldr d0, [x25], #0x8\n"
+ "ldr d1, [x24], #0x8\n"
+ "ldr d2, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d4, [x21], #0x8\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz x26, #0, 198f\n"
+ "ld1 { v0.s }[2], [x25]\n"
+ "ld1 { v1.s }[2], [x24]\n"
+ "ld1 { v2.s }[2], [x23]\n"
+ "ld1 { v3.s }[2], [x22]\n"
+ "ld1 { v4.s }[2], [x21]\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 198f\n"
+ "197:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr s0, [x25, #0x0]\n"
+ "ldr s1, [x24, #0x0]\n"
+ "ldr s2, [x23, #0x0]\n"
+ "ldr s3, [x22, #0x0]\n"
+ "ldr s4, [x21, #0x0]\n"
+ "ldr s5, [x20, #0x0]\n"
+ "198:" // Height 6: Multiply loop: Ragged operand read: Done
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "ldr q6, [x10, #0x0]\n"
+ ".inst 0x0ea16842 // bfcvtn v2.4h, v2.4s\n"
+ "ldr q7, [x10, #0x10]\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ ".inst 0x4ea16820 // bfcvtn2 v0.8h, v1.4s\n"
+ ".inst 0x4ea16862 // bfcvtn2 v2.8h, v3.4s\n"
+ ".inst 0x4ea168a4 // bfcvtn2 v4.8h, v5.4s\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec50 // bfmmla v16.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec98 // bfmmla v24.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x20]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec54 // bfmmla v20.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9c // bfmmla v28.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec51 // bfmmla v17.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec99 // bfmmla v25.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x40]\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9d // bfmmla v29.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x50]\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec52 // bfmmla v18.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9a // bfmmla v26.4s, v4.8h, v6.8h\n"
+ "ldr q6, [x10, #0x60]\n"
+ ".inst 0x6e47ec0e // bfmmla v14.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec56 // bfmmla v22.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9e // bfmmla v30.4s, v4.8h, v7.8h\n"
+ "ldr q7, [x10, #0x70]\n"
+ "add x10, x10, #0x80\n"
+ ".inst 0x6e46ec0b // bfmmla v11.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec53 // bfmmla v19.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec9b // bfmmla v27.4s, v4.8h, v6.8h\n"
+ ".inst 0x6e47ec0f // bfmmla v15.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec9f // bfmmla v31.4s, v4.8h, v7.8h\n"
+ "199:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 191b\n"
+ "uzp1 v6.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x24, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "tbz %x[flags], #1, 200f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v6.4s, v6.4s, v0.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v6.4s, v6.4s, v1.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "200:" // Height 6: No activation
+ "cmp x11, #0x10\n"
+ "bge 209f\n"
+ "tbz x11, #3, 204f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v9.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v20.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x11, #2, 202f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x24], #0x10\n"
+ "st1 { v21.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 201f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x24], #0x8\n"
+ "str d22, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x24]\n"
+ "st1 { v22.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 208f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x11, #0, 208f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x24, #0x0]\n"
+ "str s22, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 208f\n"
+ "202:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x11, #1, 203f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x24], #0x8\n"
+ "str d21, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x24]\n"
+ "st1 { v21.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 208f\n"
+ "203:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x11, #0, 208f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x24, #0x0]\n"
+ "str s21, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 208f\n"
+ "204:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x11, #2, 206f\n"
+ "st1 { v6.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x24], #0x10\n"
+ "st1 { v15.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v23.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x11, #1, 205f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x24], #0x8\n"
+ "str d20, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x24]\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 208f\n"
+ "205:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x11, #0, 208f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x24, #0x0]\n"
+ "str s20, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 208f\n"
+ "206:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x11, #1, 207f\n"
+ "str d6, [x28], #0x8\n"
+ "str d8, [x24], #0x8\n"
+ "str d15, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "str d23, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x11, #0, 208f\n"
+ "st1 { v6.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x24]\n"
+ "st1 { v15.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "st1 { v23.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 208f\n"
+ "207:" // Height 6: Partial direct writeback: partial_1_0
+ "str s6, [x28, #0x0]\n"
+ "str s8, [x24, #0x0]\n"
+ "str s15, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "str s23, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "208:" // Height 6: Partial direct writeback: Done
+ "b 210f\n"
+ "209:" // Height 6: Full writeback
+ "str q6, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x24, #0x0]\n"
+ "str q9, [x24, #0x10]\n"
+ "str q10, [x24, #0x20]\n"
+ "str q11, [x24, #0x30]\n"
+ "str q15, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "str q23, [x21, #0x0]\n"
+ "str q28, [x21, #0x10]\n"
+ "str q29, [x21, #0x20]\n"
+ "str q30, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "210:" // Height 6: Writeback done
+ "subs x11, x11, #0x10\n"
+ "bgt 177b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 212f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 211f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "211:" // Update direct input
+ "mov x19, #0x18\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "212:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
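
Note on the kernel above: the fast-mode path converts fp32 inputs to bf16 on the fly, with bfcvtn/bfcvtn2 packing two rows of four fp32 values into one bf16 vector, and bfmmla then accumulating a 2x2 fp32 tile from that 2x4 bf16 tile and the transpose of a 2x4 bf16 tile of B. A minimal scalar model of one bfmmla step, assuming only a bf16-to-fp32 helper (the names below are illustrative, not part of the patch):

    #include <cstdint>
    #include <cstring>

    // bf16 is the top 16 bits of an IEEE fp32 value.
    static float bf16_to_f32(uint16_t h) {
        uint32_t bits = uint32_t(h) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // One BFMMLA step: acc (2x2 fp32, row-major) += a * b^T, where a and b
    // are each a 2x4 bf16 tile held in one 128-bit register.
    static void bfmmla_ref(float acc[2][2], const uint16_t a[2][4],
                           const uint16_t b[2][4]) {
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 2; ++j)
                for (int k = 0; k < 4; ++k)
                    acc[i][j] += bf16_to_f32(a[i][k]) * bf16_to_f32(b[j][k]);
    }

Because each accumulator register holds a 2x2 tile spanning two output rows, the bias setup and the epilogue need the zip1/zip2 and uzp1/uzp2 shuffles seen above to move between row-major storage and this tiled register layout.
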
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index caef6396be..94f5783686 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_s8qa_dot_4x16_a55( ARGLIST );
class cls_a64_hybrid_s8qa_dot_4x16
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
return false;
}
- StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 7.5301 };
- default:
- return { 27.5482 };
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 7.5301 };
+ case CPUModel::A510:
+ return { 14.81 };
+ case CPUModel::V1:
+ return { 48.34 };
+ default:
+ return { 27.5482 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
@@ -99,4 +108,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
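
The heuristic change in this header is that get_performance_parameters is now a template over the operand type: it returns a per-CPU-model throughput estimate when T is int8_t (the A510 and V1 entries are new) and a neutral 1.0 otherwise. A hedged sketch of how a selector could rank two candidate kernel classes with these scores; the KernelA/KernelB names, and the assumption that the brace-initialized figure is a kernel_macs_cycle member, are illustrative only:

    // Sketch: rank two kernel classes by their estimated MACs per cycle for
    // operand type T on this CPU. Assumes PerformanceParameters exposes the
    // brace-initialized figure as a kernel_macs_cycle member.
    template <typename KernelA, typename KernelB, typename T>
    bool prefer_a(const CPUInfo *ci) {
        return KernelA::template get_performance_parameters<T>(ci).kernel_macs_cycle
             > KernelB::template get_performance_parameters<T>(ci).kernel_macs_cycle;
    }
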
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
index 11aa05a9b7..ee7e55f179 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp
@@ -406,10 +406,10 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"b 122f\n"
"31:" // Height 2
"movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v12.4s, #0x0\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "movi v15.16b, #0x1\n"
"mov x9, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov x28, %x[output_ptr]\n"
@@ -853,12 +853,12 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"b 122f\n"
"61:" // Height 3
"movi v11.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"movi v12.4s, #0x0\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v13.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"mov x9, %x[col_bias]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "movi v15.16b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov x28, %x[output_ptr]\n"
"62:" // Height 3: Column loop
@@ -1426,14 +1426,14 @@ void a64_hybrid_s8qa_dot_4x16_a55 (
"b 122f\n"
"91:" // Height 4
"movi v11.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"movi v12.4s, #0x0\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v13.4s, #0x0\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"movi v14.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"mov x9, %x[col_bias]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v15.16b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov x28, %x[output_ptr]\n"
"mov x19, #0x4\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
index 0adfb99f23..a1c4b34d38 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -283,16 +283,16 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqrdmulh v19.4s, v19.4s, v4.4s\n"
"tbz %x[flags], #5, 20f\n"
"and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
"and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"and v7.16b, v19.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v5.4s\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
"20:" // Height 1: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
@@ -612,8 +612,8 @@ void a64_hybrid_s8qa_dot_4x16 (
"ld1r { v2.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "addp v12.4s, v12.4s, v12.4s\n"
"neg v2.4s, v2.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
"mul v11.4s, v11.4s, v2.4s\n"
"mul v12.4s, v12.4s, v2.4s\n"
"49:" // Height 2: skip row sum fixup
@@ -653,27 +653,27 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqrdmulh v23.4s, v23.4s, v4.4s\n"
"tbz %x[flags], #5, 50f\n"
"and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
"and v6.16b, v18.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
"and v7.16b, v19.16b, v0.16b\n"
"and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"and v9.16b, v21.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
"sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
"sqadd v20.4s, v20.4s, v8.4s\n"
"sqadd v21.4s, v21.4s, v9.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v10.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
"50:" // Height 2: no shift correction
@@ -690,8 +690,6 @@ void a64_hybrid_s8qa_dot_4x16 (
"cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
"add v16.4s, v16.4s, v4.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
@@ -710,16 +708,18 @@ void a64_hybrid_s8qa_dot_4x16 (
"smax v19.4s, v19.4s, v5.4s\n"
"smax v20.4s, v20.4s, v5.4s\n"
"smax v21.4s, v21.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
"add v22.4s, v22.4s, v4.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
"smin v22.4s, v22.4s, v6.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
"smax v22.4s, v22.4s, v5.4s\n"
"smax v23.4s, v23.4s, v5.4s\n"
- "uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
"uzp1 v21.8h, v22.8h, v23.8h\n"
"uzp1 v20.16b, v20.16b, v21.16b\n"
"bge 59f\n"
@@ -1094,9 +1094,9 @@ void a64_hybrid_s8qa_dot_4x16 (
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
+ "neg v3.4s, v3.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v3.4s, v3.4s\n"
"mul v11.4s, v11.4s, v3.4s\n"
"mul v12.4s, v12.4s, v3.4s\n"
"mul v13.4s, v13.4s, v3.4s\n"
@@ -1149,39 +1149,39 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqrdmulh v27.4s, v27.4s, v4.4s\n"
"tbz %x[flags], #5, 80f\n"
"and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
"and v6.16b, v18.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
"and v7.16b, v19.16b, v0.16b\n"
"and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"and v9.16b, v21.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
"sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
"sqadd v20.4s, v20.4s, v8.4s\n"
"sqadd v21.4s, v21.4s, v9.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v10.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
"and v7.16b, v26.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"and v8.16b, v27.16b, v0.16b\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
"sqadd v26.4s, v26.4s, v7.4s\n"
"sqadd v27.4s, v27.4s, v8.4s\n"
"80:" // Height 3: no shift correction
@@ -1198,8 +1198,6 @@ void a64_hybrid_s8qa_dot_4x16 (
"cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
"add v16.4s, v16.4s, v4.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
@@ -1218,31 +1216,33 @@ void a64_hybrid_s8qa_dot_4x16 (
"smax v19.4s, v19.4s, v5.4s\n"
"smax v20.4s, v20.4s, v5.4s\n"
"smax v21.4s, v21.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
"add v22.4s, v22.4s, v4.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
"smin v22.4s, v22.4s, v6.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "srshl v25.4s, v25.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
"smax v22.4s, v22.4s, v5.4s\n"
"smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
"add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
"add v26.4s, v26.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
"add v27.4s, v27.4s, v4.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
"smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
"smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
"uzp1 v21.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
"uzp1 v25.8h, v26.8h, v27.8h\n"
@@ -1705,10 +1705,10 @@ void a64_hybrid_s8qa_dot_4x16 (
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
+ "neg v4.4s, v4.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "neg v4.4s, v4.4s\n"
"mul v11.4s, v11.4s, v4.4s\n"
"mul v12.4s, v12.4s, v4.4s\n"
"mul v13.4s, v13.4s, v4.4s\n"
@@ -1774,52 +1774,52 @@ void a64_hybrid_s8qa_dot_4x16 (
"sqrdmulh v31.4s, v31.4s, v4.4s\n"
"tbz %x[flags], #5, 110f\n"
"and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
"and v6.16b, v18.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
"and v7.16b, v19.16b, v0.16b\n"
"and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"and v9.16b, v21.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
"sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
"sqadd v20.4s, v20.4s, v8.4s\n"
"sqadd v21.4s, v21.4s, v9.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v10.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
"and v7.16b, v26.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v8.4s, v8.4s, #0x1f\n"
"sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v9.16b, v28.16b, v0.16b\n"
"and v10.16b, v29.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
"and v4.16b, v30.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
"sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "and v5.16b, v31.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v9.4s\n"
"sqadd v29.4s, v29.4s, v10.4s\n"
"sqadd v30.4s, v30.4s, v4.4s\n"
+ "and v5.16b, v31.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v31.4s, v31.4s, v5.4s\n"
"110:" // Height 4: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
@@ -1835,8 +1835,6 @@ void a64_hybrid_s8qa_dot_4x16 (
"cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
"add v16.4s, v16.4s, v4.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
@@ -1855,45 +1853,47 @@ void a64_hybrid_s8qa_dot_4x16 (
"smax v19.4s, v19.4s, v5.4s\n"
"smax v20.4s, v20.4s, v5.4s\n"
"smax v21.4s, v21.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
"add v22.4s, v22.4s, v4.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
"smin v22.4s, v22.4s, v6.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "srshl v25.4s, v25.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
"smax v22.4s, v22.4s, v5.4s\n"
"smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
"add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
"add v26.4s, v26.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
"add v27.4s, v27.4s, v4.4s\n"
- "srshl v28.4s, v28.4s, v0.4s\n"
"smin v26.4s, v26.4s, v6.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
"smin v27.4s, v27.4s, v6.4s\n"
- "srshl v29.4s, v29.4s, v0.4s\n"
"smax v26.4s, v26.4s, v5.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
"smax v27.4s, v27.4s, v5.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
"add v30.4s, v30.4s, v4.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
"add v31.4s, v31.4s, v4.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
"smin v30.4s, v30.4s, v6.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
"smin v31.4s, v31.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
"smax v30.4s, v30.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
"smax v31.4s, v31.4s, v5.4s\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
"uzp1 v21.8h, v22.8h, v23.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
new file mode 100644
index 0000000000..bc933afd9a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_s8qa_mmla_4x16( ARGLIST );
+
+class cls_a64_hybrid_s8qa_mmla_4x16
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 47.74 };
+ case CPUModel::A510:
+ return { 27.99 };
+ case CPUModel::V1:
+ return { 68.76 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8qa_mmla_4x16;
+ cls_a64_hybrid_s8qa_mmla_4x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
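
The blocking advertised by this header matches the smmla geometry: the 4-row output block is processed as two row pairs (interleaved with trn1/trn2 in the kernel body), and k_unroll() is 8 because each smmla accumulates a 2x2 int32 tile from a 2x8 int8 tile and the transpose of another 2x8 int8 tile. A scalar model of one smmla step (names illustrative):

    #include <cstdint>

    // One SMMLA step: acc (2x2 int32, row-major) += a * b^T, where a and b
    // are each a 2x8 int8 tile held in one 128-bit register.
    static void smmla_ref(int32_t acc[2][2], const int8_t a[2][8],
                          const int8_t b[2][8]) {
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 2; ++j)
                for (int k = 0; k < 8; ++k)
                    acc[i][j] += (int32_t)a[i][k] * (int32_t)b[j][k];
    }
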
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
new file mode 100644
index 0000000000..4bc807cd8e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp
@@ -0,0 +1,2104 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qa_mmla_4x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
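+
+ // Flag bits consumed by the assembly below: bit 2 (0x4) marks an indirect
+ // output and bit 3 (0x8) an indirect input; bit 5 (0x20) enables the
+ // requantize shift-correction path; bit 31 is cleared at the start of each
+ // height and set once the first row-sum fixup has run, so later column
+ // blocks skip the sdot row-sum accumulation.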
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 97f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 65f\n"
+ "beq 33f\n"
+ "movi v11.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x24, #0x10\n"
+ "blt 11f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 8f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "bge 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "sub x24, x24, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 10f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ "10:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "cbz x24, 20f\n"
+ "cmp x24, #0x8\n"
+ "blt 14f\n"
+ "12:" // Height 1: Multiply loop: Odd block loop
+ "movi v2.16b, #0x0\n"
+ "ldr d1, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 13f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "13:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q8, [x28, #0x0]\n"
+ ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n"
+ "ldr q9, [x28, #0x10]\n"
+ "sub x24, x24, #0x8\n"
+ ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n"
+ "ldr q10, [x28, #0x20]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x30]\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "ldr q8, [x28, #0x70]\n"
+ ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n"
+ "bge 12b\n"
+ "cbz x24, 20f\n"
+ "14:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x24, #2, 16f\n"
+ "ldr s1, [x23], #0x4\n"
+ "tbz x24, #1, 15f\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "tbz x24, #0, 18f\n"
+ "ld1 { v1.b }[6], [x23]\n"
+ "b 18f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x24, #0, 18f\n"
+ "ld1 { v1.b }[4], [x23]\n"
+ "b 18f\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x24, #1, 17f\n"
+ "ldr h1, [x23], #0x2\n"
+ "tbz x24, #0, 18f\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "b 18f\n"
+ "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x23, #0x0]\n"
+ "18:" // Height 1: Multiply loop: Ragged operand read: Done
+ "movi v2.16b, #0x0\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 19f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "19:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x10]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ "ldr q8, [x28, #0x50]\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x60]\n"
+ "ldr q10, [x28, #0x70]\n"
+ ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n"
+ "20:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 4b\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v16.16b\n"
+ "tbnz %x[flags], #31, 21f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v1.4s }, [x22]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "neg v1.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v1.4s\n"
+ "21:" // Height 1: skip row sum fixup
+ "add v23.4s, v23.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 22f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "22:" // Height 1: no shift correction
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x22, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v23.8h, v23.8h, v17.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v17.16b\n"
+ "bge 31f\n"
+ "tbz x9, #3, 26f\n"
+ "str d23, [x26], #0x8\n"
+ "tbz x9, #2, 24f\n"
+ "st1 { v23.s }[2], [x26], #0x4\n"
+ "tbz x9, #1, 23f\n"
+ "st1 { v23.h }[6], [x26], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[14], [x26]\n"
+ "b 30f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[12], [x26]\n"
+ "b 30f\n"
+ "24:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 25f\n"
+ "st1 { v23.h }[4], [x26], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[10], [x26]\n"
+ "b 30f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[8], [x26]\n"
+ "b 30f\n"
+ "26:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 28f\n"
+ "str s23, [x26], #0x4\n"
+ "tbz x9, #1, 27f\n"
+ "st1 { v23.h }[2], [x26], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[6], [x26]\n"
+ "b 30f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[4], [x26]\n"
+ "b 30f\n"
+ "28:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 29f\n"
+ "str h23, [x26], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[2], [x26]\n"
+ "b 30f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_0
+ "str b23, [x26, #0x0]\n"
+ "30:" // Height 1: Partial direct writeback: Done
+ "b 32f\n"
+ "31:" // Height 1: Full writeback
+ "str q23, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "32:" // Height 1: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 2b\n"
+ "b 130f\n"
+ "33:" // Height 2
+ "movi v11.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v15.16b, #0x1\n"
+ "mov x26, %x[output_ptr]\n"
+ "34:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "35:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "36:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 38f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "38:" // Height 2: input setup done
+ "cmp x24, #0x10\n"
+ "blt 43f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "blt 41f\n"
+ "39:" // Height 2: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q10, [x28, #0x50]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "bge 39b\n"
+ "41:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "sub x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 42f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ "42:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "43:" // Height 2: Multiply loop: Main loop skip
+ "cbz x24, 52f\n"
+ "cmp x24, #0x8\n"
+ "blt 46f\n"
+ "44:" // Height 2: Multiply loop: Odd block loop
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 45f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "45:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q8, [x28, #0x0]\n"
+ ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n"
+ "ldr q9, [x28, #0x10]\n"
+ "sub x24, x24, #0x8\n"
+ ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n"
+ "ldr q10, [x28, #0x20]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x30]\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "ldr q8, [x28, #0x70]\n"
+ ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n"
+ "bge 44b\n"
+ "cbz x24, 52f\n"
+ "46:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x24, #2, 48f\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "tbz x24, #1, 47f\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "ld1 { v2.h }[2], [x22], #0x2\n"
+ "tbz x24, #0, 50f\n"
+ "ld1 { v1.b }[6], [x23]\n"
+ "ld1 { v2.b }[6], [x22]\n"
+ "b 50f\n"
+ "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x24, #0, 50f\n"
+ "ld1 { v1.b }[4], [x23]\n"
+ "ld1 { v2.b }[4], [x22]\n"
+ "b 50f\n"
+ "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x24, #1, 49f\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "tbz x24, #0, 50f\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "b 50f\n"
+ "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "50:" // Height 2: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 51f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "51:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x10]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ "ldr q8, [x28, #0x50]\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x60]\n"
+ "ldr q10, [x28, #0x70]\n"
+ ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n"
+ "52:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 36b\n"
+ "uzp1 v4.2d, v16.2d, v20.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 53f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x22]\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "neg v2.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v2.4s\n"
+ "mul v12.4s, v12.4s, v2.4s\n"
+ "53:" // Height 2: skip row sum fixup
+ "add v23.4s, v23.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 54f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "and v7.16b, v22.16b, v0.16b\n"
+ "and v8.16b, v16.16b, v0.16b\n"
+ "and v9.16b, v17.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "sqadd v16.4s, v16.4s, v8.4s\n"
+ "sqadd v17.4s, v17.4s, v9.4s\n"
+ "and v10.16b, v18.16b, v0.16b\n"
+ "and v4.16b, v19.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "54:" // Height 2: no shift correction
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "add x22, %x[qp], %[minval]\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "uzp1 v23.8h, v23.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 63f\n"
+ "tbz x9, #3, 58f\n"
+ "str d23, [x26], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "tbz x9, #2, 56f\n"
+ "st1 { v23.s }[2], [x26], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "tbz x9, #1, 55f\n"
+ "st1 { v23.h }[6], [x26], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[14], [x26]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "b 62f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[12], [x26]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "b 62f\n"
+ "56:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 57f\n"
+ "st1 { v23.h }[4], [x26], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[10], [x26]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "b 62f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[8], [x26]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "b 62f\n"
+ "58:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 60f\n"
+ "str s23, [x26], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "tbz x9, #1, 59f\n"
+ "st1 { v23.h }[2], [x26], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[6], [x26]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "b 62f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[4], [x26]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "b 62f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 61f\n"
+ "str h23, [x26], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[2], [x26]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "b 62f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_0
+ "str b23, [x26, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "62:" // Height 2: Partial direct writeback: Done
+ "b 64f\n"
+ "63:" // Height 2: Full writeback
+ "str q23, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q16, [x21, #0x0]\n"
+ "64:" // Height 2: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 34b\n"
+ "b 130f\n"
+ "65:" // Height 3
+ "movi v11.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "mov x26, %x[output_ptr]\n"
+ "movi v15.16b, #0x1\n"
+ "66:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "67:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "68:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 69f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 70f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 70f\n"
+ "69:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "70:" // Height 3: input setup done
+ "cmp x24, #0x10\n"
+ "blt 75f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "blt 73f\n"
+ "71:" // Height 3: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x21, #0x0]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 72f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n"
+ "72:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "bge 71b\n"
+ "73:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x21, #0x0]\n"
+ "sub x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 74f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n"
+ "74:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "75:" // Height 3: Multiply loop: Main loop skip
+ "cbz x24, 84f\n"
+ "cmp x24, #0x8\n"
+ "blt 78f\n"
+ "76:" // Height 3: Multiply loop: Odd block loop
+ "movi v7.16b, #0x0\n"
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x21], #0x8\n"
+ "trn1 v2.2d, v3.2d, v7.2d\n"
+ "tbnz %x[flags], #31, 77f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "77:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q8, [x28, #0x0]\n"
+ ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n"
+ "ldr q9, [x28, #0x10]\n"
+ "sub x24, x24, #0x8\n"
+ ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n"
+ "ldr q10, [x28, #0x20]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "ldr q8, [x28, #0x70]\n"
+ ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n"
+ "bge 76b\n"
+ "cbz x24, 84f\n"
+ "78:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x24, #2, 80f\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "tbz x24, #1, 79f\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "ld1 { v2.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "tbz x24, #0, 82f\n"
+ "ld1 { v1.b }[6], [x23]\n"
+ "ld1 { v2.b }[6], [x22]\n"
+ "ld1 { v3.b }[6], [x21]\n"
+ "b 82f\n"
+ "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x24, #0, 82f\n"
+ "ld1 { v1.b }[4], [x23]\n"
+ "ld1 { v2.b }[4], [x22]\n"
+ "ld1 { v3.b }[4], [x21]\n"
+ "b 82f\n"
+ "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x24, #1, 81f\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "tbz x24, #0, 82f\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x21]\n"
+ "b 82f\n"
+ "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x21, #0x0]\n"
+ "82:" // Height 3: Multiply loop: Ragged operand read: Done
+ "movi v9.16b, #0x0\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v9.2d\n"
+ "tbnz %x[flags], #31, 83f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "83:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n"
+ "ldr q8, [x28, #0x50]\n"
+ ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n"
+ "ldr q9, [x28, #0x60]\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ldr q10, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n"
+ ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n"
+ ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n"
+ "84:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 68b\n"
+ "uzp1 v4.2d, v16.2d, v20.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "add x20, x21, x19\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 85f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v3.4s }, [x22]\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "neg v3.4s, v3.4s\n"
+ "dup v13.4s, v13.s[0]\n"
+ "mul v11.4s, v11.4s, v3.4s\n"
+ "mul v12.4s, v12.4s, v3.4s\n"
+ "mul v13.4s, v13.4s, v3.4s\n"
+ "85:" // Height 3: skip row sum fixup
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 86f\n"
+ "and v4.16b, v31.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "and v7.16b, v22.16b, v0.16b\n"
+ "and v8.16b, v16.16b, v0.16b\n"
+ "and v9.16b, v17.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "sqadd v16.4s, v16.4s, v8.4s\n"
+ "sqadd v17.4s, v17.4s, v9.4s\n"
+ "and v10.16b, v18.16b, v0.16b\n"
+ "and v4.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "86:" // Height 3: no shift correction
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "add x22, %x[qp], %[minval]\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "uzp1 v31.8h, v31.8h, v20.8h\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 95f\n"
+ "tbz x9, #3, 90f\n"
+ "str d31, [x26], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x9, #2, 88f\n"
+ "st1 { v31.s }[2], [x26], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x9, #1, 87f\n"
+ "st1 { v31.h }[6], [x26], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[14], [x26]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 94f\n"
+ "87:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[12], [x26]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 94f\n"
+ "88:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 89f\n"
+ "st1 { v31.h }[4], [x26], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[10], [x26]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 94f\n"
+ "89:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[8], [x26]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 94f\n"
+ "90:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 92f\n"
+ "str s31, [x26], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x9, #1, 91f\n"
+ "st1 { v31.h }[2], [x26], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[6], [x26]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 94f\n"
+ "91:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[4], [x26]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 94f\n"
+ "92:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 93f\n"
+ "str h31, [x26], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[2], [x26]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 94f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_0
+ "str b31, [x26, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "94:" // Height 3: Partial direct writeback: Done
+ "b 96f\n"
+ "95:" // Height 3: Full writeback
+ "str q31, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q16, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "96:" // Height 3: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 66b\n"
+ "b 130f\n"
+ "97:" // Height 4
+ "movi v11.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "movi v14.4s, #0x0\n"
+ "mov x19, #0x4\n"
+ "movi v15.16b, #0x1\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "98:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "99:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "100:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 101f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x25, 102f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 102f\n"
+ "101:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "102:" // Height 4: input setup done
+ "cmp x24, #0x10\n"
+ "blt 107f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "blt 105f\n"
+ "103:" // Height 4: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x20, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 104f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n"
+ "104:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "bge 103b\n"
+ "105:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x21, #0x0]\n"
+ "sub x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x20, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e85a410 // smmla v16.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ ".inst 0x4e86a414 // smmla v20.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ ".inst 0x4e86a45c // smmla v28.4s, v2.16b, v6.16b\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x4e87a411 // smmla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x4e88a415 // smmla v21.4s, v0.16b, v8.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x4e88a45d // smmla v29.4s, v2.16b, v8.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x4e89a412 // smmla v18.4s, v0.16b, v9.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x4e89a45a // smmla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x4e8aa416 // smmla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45e // smmla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x4e84a413 // smmla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a45b // smmla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x4e85a417 // smmla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45f // smmla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a478 // smmla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a434 // smmla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a47c // smmla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e88a431 // smmla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x4e88a479 // smmla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x4e89a435 // smmla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x4e89a47d // smmla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x4e8aa432 // smmla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x4e8aa47a // smmla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x4e84a436 // smmla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a437 // smmla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a47f // smmla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 106f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f942b // sdot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f946d // sdot v13.4s, v3.16b, v15.16b\n"
+ "106:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "107:" // Height 4: Multiply loop: Main loop skip
+ "cbz x24, 116f\n"
+ "cmp x24, #0x8\n"
+ "blt 110f\n"
+ "108:" // Height 4: Multiply loop: Odd block loop
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d7, [x20], #0x8\n"
+ "trn1 v2.2d, v3.2d, v7.2d\n"
+ "tbnz %x[flags], #31, 109f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "109:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q8, [x28, #0x0]\n"
+ ".inst 0x4e88a410 // smmla v16.4s, v0.16b, v8.16b\n"
+ "ldr q9, [x28, #0x10]\n"
+ "sub x24, x24, #0x8\n"
+ ".inst 0x4e88a458 // smmla v24.4s, v2.16b, v8.16b\n"
+ "ldr q10, [x28, #0x20]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x4e89a414 // smmla v20.4s, v0.16b, v9.16b\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x4e89a45c // smmla v28.4s, v2.16b, v9.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x4e8aa411 // smmla v17.4s, v0.16b, v10.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e8aa459 // smmla v25.4s, v2.16b, v10.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "ldr q8, [x28, #0x70]\n"
+ ".inst 0x4e84a415 // smmla v21.4s, v0.16b, v4.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e84a45d // smmla v29.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a412 // smmla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a45a // smmla v26.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e86a416 // smmla v22.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45e // smmla v30.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a413 // smmla v19.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a45b // smmla v27.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e88a417 // smmla v23.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e88a45f // smmla v31.4s, v2.16b, v8.16b\n"
+ "bge 108b\n"
+ "cbz x24, 116f\n"
+ "110:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x24, #2, 112f\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz x24, #1, 111f\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "ld1 { v2.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x24, #0, 114f\n"
+ "ld1 { v1.b }[6], [x23]\n"
+ "ld1 { v2.b }[6], [x22]\n"
+ "ld1 { v3.b }[6], [x21]\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 114f\n"
+ "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x24, #0, 114f\n"
+ "ld1 { v1.b }[4], [x23]\n"
+ "ld1 { v2.b }[4], [x22]\n"
+ "ld1 { v3.b }[4], [x21]\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 114f\n"
+ "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x24, #1, 113f\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "ldr h9, [x20], #0x2\n"
+ "tbz x24, #0, 114f\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x21]\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 114f\n"
+ "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x21, #0x0]\n"
+ "ldr b9, [x20, #0x0]\n"
+ "114:" // Height 4: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v9.2d\n"
+ "tbnz %x[flags], #31, 115f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "115:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x4e8aa410 // smmla v16.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x4e8aa458 // smmla v24.4s, v2.16b, v10.16b\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e84a414 // smmla v20.4s, v0.16b, v4.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e84a45c // smmla v28.4s, v2.16b, v4.16b\n"
+ "ldr q8, [x28, #0x50]\n"
+ ".inst 0x4e85a411 // smmla v17.4s, v0.16b, v5.16b\n"
+ "ldr q9, [x28, #0x60]\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ldr q10, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e86a415 // smmla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a45d // smmla v29.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a412 // smmla v18.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a45a // smmla v26.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e88a416 // smmla v22.4s, v0.16b, v8.16b\n"
+ ".inst 0x4e88a45e // smmla v30.4s, v2.16b, v8.16b\n"
+ ".inst 0x4e89a413 // smmla v19.4s, v0.16b, v9.16b\n"
+ ".inst 0x4e89a45b // smmla v27.4s, v2.16b, v9.16b\n"
+ ".inst 0x4e8aa417 // smmla v23.4s, v0.16b, v10.16b\n"
+ ".inst 0x4e8aa45f // smmla v31.4s, v2.16b, v10.16b\n"
+ "116:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 100b\n"
+ "uzp1 v4.2d, v16.2d, v20.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "add x20, x21, x19\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 117f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "neg v4.4s, v4.4s\n"
+ "dup v14.4s, v13.s[3]\n"
+ "dup v13.4s, v13.s[0]\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "mul v12.4s, v12.4s, v4.4s\n"
+ "mul v13.4s, v13.4s, v4.4s\n"
+ "mul v14.4s, v14.4s, v4.4s\n"
+ "117:" // Height 4: skip row sum fixup
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v1.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 118f\n"
+ "and v4.16b, v31.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "and v7.16b, v22.16b, v0.16b\n"
+ "and v8.16b, v16.16b, v0.16b\n"
+ "and v9.16b, v17.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "sqadd v16.4s, v16.4s, v8.4s\n"
+ "sqadd v17.4s, v17.4s, v9.4s\n"
+ "and v10.16b, v18.16b, v0.16b\n"
+ "and v4.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v23.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sqadd v23.4s, v23.4s, v5.4s\n"
+ "and v6.16b, v28.16b, v0.16b\n"
+ "and v7.16b, v29.16b, v0.16b\n"
+ "and v8.16b, v30.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v6.4s\n"
+ "sqadd v29.4s, v29.4s, v7.4s\n"
+ "sqadd v30.4s, v30.4s, v8.4s\n"
+ "and v9.16b, v24.16b, v0.16b\n"
+ "and v10.16b, v25.16b, v0.16b\n"
+ "and v4.16b, v26.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v4.4s\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "118:" // Height 4: no shift correction
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "add x22, %x[qp], %[minval]\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "uzp1 v31.8h, v31.8h, v20.8h\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.8h, v23.8h, v28.8h\n"
+ "uzp1 v28.8h, v29.8h, v30.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v23.16b, v23.16b, v28.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 127f\n"
+ "tbz x9, #3, 122f\n"
+ "str d31, [x26], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d23, [x20], #0x8\n"
+ "str d24, [x19], #0x8\n"
+ "tbz x9, #2, 120f\n"
+ "st1 { v31.s }[2], [x26], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "st1 { v23.s }[2], [x20], #0x4\n"
+ "st1 { v24.s }[2], [x19], #0x4\n"
+ "tbz x9, #1, 119f\n"
+ "st1 { v31.h }[6], [x26], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "st1 { v23.h }[6], [x20], #0x2\n"
+ "st1 { v24.h }[6], [x19], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[14], [x26]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "st1 { v23.b }[14], [x20]\n"
+ "st1 { v24.b }[14], [x19]\n"
+ "b 126f\n"
+ "119:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[12], [x26]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "st1 { v23.b }[12], [x20]\n"
+ "st1 { v24.b }[12], [x19]\n"
+ "b 126f\n"
+ "120:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 121f\n"
+ "st1 { v31.h }[4], [x26], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "st1 { v23.h }[4], [x20], #0x2\n"
+ "st1 { v24.h }[4], [x19], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[10], [x26]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "st1 { v23.b }[10], [x20]\n"
+ "st1 { v24.b }[10], [x19]\n"
+ "b 126f\n"
+ "121:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[8], [x26]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "st1 { v23.b }[8], [x20]\n"
+ "st1 { v24.b }[8], [x19]\n"
+ "b 126f\n"
+ "122:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 124f\n"
+ "str s31, [x26], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "str s23, [x20], #0x4\n"
+ "str s24, [x19], #0x4\n"
+ "tbz x9, #1, 123f\n"
+ "st1 { v31.h }[2], [x26], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "st1 { v23.h }[2], [x20], #0x2\n"
+ "st1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[6], [x26]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "st1 { v23.b }[6], [x20]\n"
+ "st1 { v24.b }[6], [x19]\n"
+ "b 126f\n"
+ "123:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[4], [x26]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "st1 { v23.b }[4], [x20]\n"
+ "st1 { v24.b }[4], [x19]\n"
+ "b 126f\n"
+ "124:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 125f\n"
+ "str h31, [x26], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "str h23, [x20], #0x2\n"
+ "str h24, [x19], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[2], [x26]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "st1 { v24.b }[2], [x19]\n"
+ "b 126f\n"
+ "125:" // Height 4: Partial direct writeback: partial_1_0
+ "str b31, [x26, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "str b23, [x20, #0x0]\n"
+ "str b24, [x19, #0x0]\n"
+ "126:" // Height 4: Partial direct writeback: Done
+ "b 128f\n"
+ "127:" // Height 4: Full writeback
+ "str q31, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q16, [x21, #0x0]\n"
+ "str q23, [x20, #0x0]\n"
+ "str q24, [x19, #0x0]\n"
+ "128:" // Height 4: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 98b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 130f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 129f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "129:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "130:" // Exit
+
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
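
The block above, from the sqrdmulh sequence through the uzp1 narrowing and the partial/full writeback paths, is the requantization epilogue these hybrid int8 kernels share: each int32 accumulator is scaled by a fixed-point multiplier, rounding-shifted right, offset by c_offset, clamped to [minval, maxval] and narrowed to int8. A minimal scalar model of one lane follows; it is a sketch of the arithmetic only (the helper name is illustrative, saturation corner cases are simplified, and in the kernel the shift amount is stored negated so that srshl performs the rounding right shift):

#include <algorithm>
#include <cstdint>

// Scalar model of one lane of the sqrdmulh/sqadd/srshl/add/smin/smax chain.
static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                              int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqrdmulh: rounding doubling multiply returning the high half,
    // i.e. (acc * mul + 2^30) >> 31 (saturation at INT32_MIN omitted).
    int64_t prod = static_cast<int64_t>(acc) * mul;
    int32_t high = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);

    // Shift correction (the and / sshr #31 / sqadd block, run only when
    // flags bit 5 is set): nudge negative values down by one so the
    // rounding shift below rounds halfway cases away from zero rather
    // than towards positive infinity.
    if (shift > 0 && high < 0) {
        high -= 1;
    }

    // srshl by a negative amount acts as a rounding arithmetic shift right.
    int32_t shifted = (shift > 0)
        ? static_cast<int32_t>((static_cast<int64_t>(high) + (int64_t{1} << (shift - 1))) >> shift)
        : high;

    // Add the output offset and clamp; the uzp1 pair then packs 32-bit
    // lanes down to 16 and finally 8 bits for the stores.
    int32_t out = std::min(std::max(shifted + c_offset, minval), maxval);
    return static_cast<int8_t>(out);
}
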
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
index eb5bdfe55c..b028a8a9a3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_s8qs_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_s8qs_dot_6x16
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
return false;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 8.28 };
- default:
- return { 27.5482 };
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 7.5301 };
+ case CPUModel::A510:
+ return { 15.71 };
+ case CPUModel::V1:
+ return { 52.09 };
+ default:
+ return { 27.5482 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
@@ -99,4 +108,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
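
The header change above turns get_performance_parameters() into a template over the operand type, so the same class can report a realistic MACs-per-cycle figure for the types it is specialised for and a neutral { 1.0 } otherwise. A sketch of how a kernel selector might consume this follows; it assumes PerformanceParameters exposes the estimate as kernel_macs_cycle and the helper name is illustrative, loosely modelled on the selection code rather than copied from it:

#include <cstdint>

// Rank a candidate kernel by a rough cycle estimate for an M x N x K GEMM.
template <typename Strategy, typename T>
static uint64_t estimate_cycles(const CPUInfo *ci, uint64_t M, uint64_t N, uint64_t K)
{
    // Assumption: the braced figures in the switch above initialise the
    // kernel's MACs/cycle estimate for operand type T on this CPU model.
    float macs_cycle = Strategy::template get_performance_parameters<T>(ci).kernel_macs_cycle;
    return static_cast<uint64_t>(static_cast<float>(M * N * K) / macs_cycle);
}

A smaller estimate wins, which is the general way the per-model figures (7.5301 on A55r1, 52.09 on V1, and so on) can steer selection between kernel variants.
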
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
index 6e3a00ed72..ba8a2ccb1d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp
@@ -309,8 +309,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ld1r { v0.4s }, [x25]\n"
"ld1r { v4.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -693,8 +693,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ld1r { v0.4s }, [x25]\n"
"ld1r { v4.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -1193,8 +1193,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ld1r { v0.4s }, [x25]\n"
"ld1r { v4.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -1809,8 +1809,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ld1r { v0.4s }, [x25]\n"
"ld1r { v4.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -2541,8 +2541,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ld1r { v0.4s }, [x25]\n"
"ld1r { v4.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
@@ -3392,8 +3392,8 @@ void a64_hybrid_s8qs_dot_6x16_a55 (
"ld1r { v0.4s }, [x25]\n"
"ld1r { v4.4s }, [x24]\n"
"mov v1.16b, v0.16b\n"
- "mov v2.16b, v0.16b\n"
"mov v5.16b, v4.16b\n"
+ "mov v2.16b, v0.16b\n"
"mov v6.16b, v4.16b\n"
"mov v3.16b, v0.16b\n"
"mov v7.16b, v4.16b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
index 5a4df161aa..f503f40b0c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -287,16 +287,16 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v11.4s, v11.4s, v7.4s\n"
"tbz %x[flags], #5, 17f\n"
"and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
"and v6.16b, v10.16b, v2.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"and v7.16b, v11.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v9.4s, v9.4s, v5.4s\n"
"sqadd v10.4s, v10.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v11.4s, v11.4s, v7.4s\n"
"17:" // Height 1: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
@@ -639,27 +639,27 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v15.4s, v15.4s, v7.4s\n"
"tbz %x[flags], #5, 44f\n"
"and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
"and v6.16b, v10.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v12.16b, v0.16b\n"
"and v5.16b, v13.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v4.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
"and v7.16b, v15.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v5.4s\n"
"sqadd v14.4s, v14.4s, v6.4s\n"
"sqadd v15.4s, v15.4s, v7.4s\n"
"44:" // Height 2: no shift correction
@@ -676,8 +676,6 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "srshl v14.4s, v14.4s, v2.4s\n"
- "srshl v15.4s, v15.4s, v3.4s\n"
"add v8.4s, v8.4s, v4.4s\n"
"add v9.4s, v9.4s, v4.4s\n"
"add v10.4s, v10.4s, v4.4s\n"
@@ -696,16 +694,18 @@ void a64_hybrid_s8qs_dot_6x16 (
"smax v11.4s, v11.4s, v5.4s\n"
"smax v12.4s, v12.4s, v5.4s\n"
"smax v13.4s, v13.4s, v5.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
"add v14.4s, v14.4s, v4.4s\n"
"add v15.4s, v15.4s, v4.4s\n"
- "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
"smin v14.4s, v14.4s, v6.4s\n"
"smin v15.4s, v15.4s, v6.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
"smax v14.4s, v14.4s, v5.4s\n"
"smax v15.4s, v15.4s, v5.4s\n"
- "uzp1 v12.8h, v12.8h, v13.8h\n"
- "uzp1 v8.16b, v8.16b, v9.16b\n"
"uzp1 v13.8h, v14.8h, v15.8h\n"
"uzp1 v12.16b, v12.16b, v13.16b\n"
"bge 53f\n"
@@ -1105,37 +1105,37 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v19.4s, v19.4s, v7.4s\n"
"tbz %x[flags], #5, 71f\n"
"and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
"and v6.16b, v10.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v12.16b, v0.16b\n"
"and v5.16b, v13.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
"and v4.16b, v16.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v14.4s, v14.4s, v6.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "and v6.16b, v18.16b, v2.16b\n"
"and v7.16b, v19.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v5.4s\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
@@ -1154,8 +1154,6 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "srshl v14.4s, v14.4s, v2.4s\n"
- "srshl v15.4s, v15.4s, v3.4s\n"
"add v8.4s, v8.4s, v4.4s\n"
"add v9.4s, v9.4s, v4.4s\n"
"add v10.4s, v10.4s, v4.4s\n"
@@ -1174,31 +1172,33 @@ void a64_hybrid_s8qs_dot_6x16 (
"smax v11.4s, v11.4s, v5.4s\n"
"smax v12.4s, v12.4s, v5.4s\n"
"smax v13.4s, v13.4s, v5.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
"add v14.4s, v14.4s, v4.4s\n"
"add v15.4s, v15.4s, v4.4s\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
"smin v14.4s, v14.4s, v6.4s\n"
"smin v15.4s, v15.4s, v6.4s\n"
- "srshl v17.4s, v17.4s, v1.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
"smax v14.4s, v14.4s, v5.4s\n"
"smax v15.4s, v15.4s, v5.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
"add v18.4s, v18.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
"add v19.4s, v19.4s, v4.4s\n"
- "uzp1 v8.8h, v8.8h, v9.8h\n"
"smin v18.4s, v18.4s, v6.4s\n"
- "smin v19.4s, v19.4s, v6.4s\n"
"uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
"smax v18.4s, v18.4s, v5.4s\n"
- "smax v19.4s, v19.4s, v5.4s\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
"uzp1 v13.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
@@ -1685,52 +1685,52 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v23.4s, v23.4s, v7.4s\n"
"tbz %x[flags], #5, 98f\n"
"and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
"and v6.16b, v10.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v12.16b, v0.16b\n"
"and v5.16b, v13.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
"and v4.16b, v16.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v14.4s, v14.4s, v6.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "and v6.16b, v18.16b, v2.16b\n"
"and v7.16b, v19.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v5.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "and v5.16b, v21.16b, v1.16b\n"
"and v6.16b, v22.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v20.4s, v20.4s, v4.4s\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v21.4s, v21.4s, v5.4s\n"
"sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v23.4s, v23.4s, v7.4s\n"
"98:" // Height 4: no shift correction
"srshl v8.4s, v8.4s, v0.4s\n"
@@ -1746,8 +1746,6 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "srshl v14.4s, v14.4s, v2.4s\n"
- "srshl v15.4s, v15.4s, v3.4s\n"
"add v8.4s, v8.4s, v4.4s\n"
"add v9.4s, v9.4s, v4.4s\n"
"add v10.4s, v10.4s, v4.4s\n"
@@ -1766,45 +1764,47 @@ void a64_hybrid_s8qs_dot_6x16 (
"smax v11.4s, v11.4s, v5.4s\n"
"smax v12.4s, v12.4s, v5.4s\n"
"smax v13.4s, v13.4s, v5.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
"add v14.4s, v14.4s, v4.4s\n"
"add v15.4s, v15.4s, v4.4s\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
"smin v14.4s, v14.4s, v6.4s\n"
"smin v15.4s, v15.4s, v6.4s\n"
- "srshl v17.4s, v17.4s, v1.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
"smax v14.4s, v14.4s, v5.4s\n"
"smax v15.4s, v15.4s, v5.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
"add v19.4s, v19.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
"smin v18.4s, v18.4s, v6.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
"smin v19.4s, v19.4s, v6.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
"smax v18.4s, v18.4s, v5.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
"smax v19.4s, v19.4s, v5.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
"add v22.4s, v22.4s, v4.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "uzp1 v8.8h, v8.8h, v9.8h\n"
"smin v22.4s, v22.4s, v6.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
"smax v22.4s, v22.4s, v5.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
"smax v23.4s, v23.4s, v5.4s\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
"uzp1 v13.8h, v14.8h, v15.8h\n"
@@ -2379,63 +2379,63 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v27.4s, v27.4s, v7.4s\n"
"tbz %x[flags], #5, 125f\n"
"and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
"and v6.16b, v10.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v12.16b, v0.16b\n"
"and v5.16b, v13.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
"and v4.16b, v16.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v14.4s, v14.4s, v6.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "and v6.16b, v18.16b, v2.16b\n"
"and v7.16b, v19.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v5.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "and v5.16b, v21.16b, v1.16b\n"
"and v6.16b, v22.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v20.4s, v20.4s, v4.4s\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v21.4s, v21.4s, v5.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "and v4.16b, v24.16b, v0.16b\n"
"and v5.16b, v25.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v23.4s, v23.4s, v7.4s\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v24.4s, v24.4s, v4.4s\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
"and v7.16b, v27.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v25.4s, v25.4s, v5.4s\n"
"sqadd v26.4s, v26.4s, v6.4s\n"
"sqadd v27.4s, v27.4s, v7.4s\n"
"125:" // Height 5: no shift correction
@@ -2452,8 +2452,6 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "srshl v14.4s, v14.4s, v2.4s\n"
- "srshl v15.4s, v15.4s, v3.4s\n"
"add v8.4s, v8.4s, v4.4s\n"
"add v9.4s, v9.4s, v4.4s\n"
"add v10.4s, v10.4s, v4.4s\n"
@@ -2472,62 +2470,64 @@ void a64_hybrid_s8qs_dot_6x16 (
"smax v11.4s, v11.4s, v5.4s\n"
"smax v12.4s, v12.4s, v5.4s\n"
"smax v13.4s, v13.4s, v5.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
"add v14.4s, v14.4s, v4.4s\n"
"add v15.4s, v15.4s, v4.4s\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
"smin v14.4s, v14.4s, v6.4s\n"
"smin v15.4s, v15.4s, v6.4s\n"
- "srshl v17.4s, v17.4s, v1.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
"smax v14.4s, v14.4s, v5.4s\n"
"smax v15.4s, v15.4s, v5.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
"add v19.4s, v19.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
"smin v18.4s, v18.4s, v6.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
"smin v19.4s, v19.4s, v6.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
"smax v18.4s, v18.4s, v5.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
"smax v19.4s, v19.4s, v5.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
"add v22.4s, v22.4s, v4.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
"smin v22.4s, v22.4s, v6.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "srshl v25.4s, v25.4s, v1.4s\n"
"smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
"add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "srshl v26.4s, v26.4s, v2.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
"smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
"smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
"add v26.4s, v26.4s, v4.4s\n"
"add v27.4s, v27.4s, v4.4s\n"
- "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
"smin v26.4s, v26.4s, v6.4s\n"
"smin v27.4s, v27.4s, v6.4s\n"
- "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
"smax v26.4s, v26.4s, v5.4s\n"
"smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
"uzp1 v13.8h, v14.8h, v15.8h\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
@@ -3190,73 +3190,73 @@ void a64_hybrid_s8qs_dot_6x16 (
"sqrdmulh v31.4s, v31.4s, v7.4s\n"
"tbz %x[flags], #5, 152f\n"
"and v4.16b, v8.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v9.16b, v1.16b\n"
"and v6.16b, v10.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
- "and v7.16b, v11.16b, v3.16b\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v8.4s, v8.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "and v4.16b, v12.16b, v0.16b\n"
"sqadd v9.4s, v9.4s, v5.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v12.16b, v0.16b\n"
"and v5.16b, v13.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v11.4s, v11.4s, v7.4s\n"
- "and v6.16b, v14.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v12.4s, v12.4s, v4.4s\n"
- "and v7.16b, v15.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "and v7.16b, v15.16b, v3.16b\n"
"and v4.16b, v16.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v14.4s, v14.4s, v6.4s\n"
- "and v5.16b, v17.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v15.4s, v15.4s, v7.4s\n"
- "and v6.16b, v18.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "and v6.16b, v18.16b, v2.16b\n"
"and v7.16b, v19.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v5.4s\n"
- "and v4.16b, v20.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
- "and v5.16b, v21.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "and v5.16b, v21.16b, v1.16b\n"
"and v6.16b, v22.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v20.4s, v20.4s, v4.4s\n"
- "and v7.16b, v23.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v21.4s, v21.4s, v5.4s\n"
- "and v4.16b, v24.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "and v4.16b, v24.16b, v0.16b\n"
"and v5.16b, v25.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v23.4s, v23.4s, v7.4s\n"
- "and v6.16b, v26.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v24.4s, v24.4s, v4.4s\n"
- "and v7.16b, v27.16b, v3.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v25.4s, v25.4s, v5.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "and v7.16b, v27.16b, v3.16b\n"
"and v4.16b, v28.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v26.4s, v26.4s, v6.4s\n"
- "and v5.16b, v29.16b, v1.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v27.4s, v27.4s, v7.4s\n"
- "and v6.16b, v30.16b, v2.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v4.4s\n"
+ "and v5.16b, v29.16b, v1.16b\n"
+ "and v6.16b, v30.16b, v2.16b\n"
"and v7.16b, v31.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
"sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v29.4s, v29.4s, v5.4s\n"
"sqadd v30.4s, v30.4s, v6.4s\n"
@@ -3275,8 +3275,6 @@ void a64_hybrid_s8qs_dot_6x16 (
"cmp x10, #0x10\n"
"srshl v12.4s, v12.4s, v0.4s\n"
"srshl v13.4s, v13.4s, v1.4s\n"
- "srshl v14.4s, v14.4s, v2.4s\n"
- "srshl v15.4s, v15.4s, v3.4s\n"
"add v8.4s, v8.4s, v4.4s\n"
"add v9.4s, v9.4s, v4.4s\n"
"add v10.4s, v10.4s, v4.4s\n"
@@ -3295,80 +3293,82 @@ void a64_hybrid_s8qs_dot_6x16 (
"smax v11.4s, v11.4s, v5.4s\n"
"smax v12.4s, v12.4s, v5.4s\n"
"smax v13.4s, v13.4s, v5.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
"add v14.4s, v14.4s, v4.4s\n"
"add v15.4s, v15.4s, v4.4s\n"
- "srshl v16.4s, v16.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
"smin v14.4s, v14.4s, v6.4s\n"
"smin v15.4s, v15.4s, v6.4s\n"
- "srshl v17.4s, v17.4s, v1.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
"smax v14.4s, v14.4s, v5.4s\n"
"smax v15.4s, v15.4s, v5.4s\n"
- "add v16.4s, v16.4s, v4.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"srshl v18.4s, v18.4s, v2.4s\n"
- "smin v16.4s, v16.4s, v6.4s\n"
- "smin v17.4s, v17.4s, v6.4s\n"
"srshl v19.4s, v19.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v5.4s\n"
- "smax v17.4s, v17.4s, v5.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
"add v19.4s, v19.4s, v4.4s\n"
- "srshl v20.4s, v20.4s, v0.4s\n"
"smin v18.4s, v18.4s, v6.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
"smin v19.4s, v19.4s, v6.4s\n"
- "srshl v21.4s, v21.4s, v1.4s\n"
"smax v18.4s, v18.4s, v5.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
"smax v19.4s, v19.4s, v5.4s\n"
- "add v20.4s, v20.4s, v4.4s\n"
- "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
- "smin v20.4s, v20.4s, v6.4s\n"
- "smin v21.4s, v21.4s, v6.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "smax v20.4s, v20.4s, v5.4s\n"
- "smax v21.4s, v21.4s, v5.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
"add v22.4s, v22.4s, v4.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
"smin v22.4s, v22.4s, v6.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "srshl v25.4s, v25.4s, v1.4s\n"
"smax v22.4s, v22.4s, v5.4s\n"
- "smax v23.4s, v23.4s, v5.4s\n"
"add v24.4s, v24.4s, v4.4s\n"
- "add v25.4s, v25.4s, v4.4s\n"
- "srshl v26.4s, v26.4s, v2.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
"smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
"smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
"add v26.4s, v26.4s, v4.4s\n"
"add v27.4s, v27.4s, v4.4s\n"
- "srshl v28.4s, v28.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
"smin v26.4s, v26.4s, v6.4s\n"
"smin v27.4s, v27.4s, v6.4s\n"
- "srshl v29.4s, v29.4s, v1.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
"smax v26.4s, v26.4s, v5.4s\n"
"smax v27.4s, v27.4s, v5.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v2.4s\n"
+ "srshl v31.4s, v31.4s, v3.4s\n"
"add v28.4s, v28.4s, v4.4s\n"
"add v29.4s, v29.4s, v4.4s\n"
- "srshl v30.4s, v30.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
"smin v28.4s, v28.4s, v6.4s\n"
"smin v29.4s, v29.4s, v6.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
"smax v28.4s, v28.4s, v5.4s\n"
"smax v29.4s, v29.4s, v5.4s\n"
- "add v30.4s, v30.4s, v4.4s\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
"add v31.4s, v31.4s, v4.4s\n"
"uzp1 v8.8h, v8.8h, v9.8h\n"
- "smin v30.4s, v30.4s, v6.4s\n"
- "smin v31.4s, v31.4s, v6.4s\n"
"uzp1 v9.8h, v10.8h, v11.8h\n"
- "smax v30.4s, v30.4s, v5.4s\n"
- "smax v31.4s, v31.4s, v5.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
"uzp1 v12.8h, v12.8h, v13.8h\n"
"uzp1 v13.8h, v14.8h, v15.8h\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
"uzp1 v16.8h, v16.8h, v17.8h\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
new file mode 100644
index 0000000000..7eacdceae7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_s8qs_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8qs_mmla_6x16
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 50.42 };
+ case CPUModel::A510:
+ return { 28.71 };
+ case CPUModel::V1:
+ return { 77.72 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8qs_mmla_6x16;
+ cls_a64_hybrid_s8qs_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
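
The blocking parameters in the new class fix the kernel's tile shape: 6 output rows, 16 output columns, and a K unroll of 8, matching smmla's consumption of 8 int8 values per lane pair. A small illustration of what that implies for tiling an M x N x K problem (a sketch only, not library API):

#include <cstddef>

struct TileCounts { size_t row_blocks; size_t col_blocks; size_t k_blocks; };

// How many 6x16 output tiles and 8-deep K blocks an M x N x K GEMM needs.
static TileCounts tiles_for(size_t M, size_t N, size_t K)
{
    constexpr size_t out_height = 6;   // rows per kernel invocation
    constexpr size_t out_width  = 16;  // int8 output columns per invocation
    constexpr size_t k_unroll   = 8;   // K elements consumed per mmla step
    return {
        (M + out_height - 1) / out_height,
        (N + out_width - 1) / out_width,
        (K + k_unroll - 1) / k_unroll,  // K is padded up to a multiple of 8
    };
}
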
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..8924492e41
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp
@@ -0,0 +1,3640 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qs_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs {
+ const int32_t *multiplier_ptr = {};
+ const int32_t *shift_ptr = {};
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
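+ // Flag bits consumed by the assembly below, as established above:
+ // bit 2 (0x4): indirect output (output_arg.is_indirect)
+ // bit 3 (0x8): indirect input (A_arg.is_indirect), tested by tbz #3
+ // bit 4 (0x10): per-channel requantization, tested by tbz #4
+ // bit 5 (0x20): qp->c_offset > qp->minval, gates the shift-correction
+ // blocks tested by tbz #5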
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 146f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 117f\n"
+ "beq 88f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 59f\n"
+ "beq 30f\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "2:" // Height 1: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "blt 9f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 8f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ "bge 7b\n"
+ "8:" // Height 1: Multiply loop: Single iteration only
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x28, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "9:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 16f\n"
+ "cmp x26, #0x8\n"
+ "blt 11f\n"
+ "10:" // Height 1: Multiply loop: Odd block loop
+ "movi v2.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x0]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ "ldr q7, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ "bge 10b\n"
+ "cbz x26, 16f\n"
+ "11:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 13f\n"
+ "ldr s1, [x25], #0x4\n"
+ "tbz x26, #1, 12f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "tbz x26, #0, 15f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "b 15f\n"
+ "12:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 15f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "b 15f\n"
+ "13:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 14f\n"
+ "ldr h1, [x25], #0x2\n"
+ "tbz x26, #0, 15f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "b 15f\n"
+ "14:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: Done
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "16:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 4b\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "ldr q0, [x11, #0x0]\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "ldr q1, [x11, #0x10]\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "ldr q2, [x11, #0x20]\n"
+ "mov v15.16b, v8.16b\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "tbz %x[flags], #4, 17f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 18f\n"
+ "17:" // Height 1: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "18:" // Height 1: parameters loaded
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "tbz %x[flags], #5, 19f\n"
+ "and v4.16b, v15.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "19:" // Height 1: no shift correction
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x24, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "uzp1 v15.8h, v15.8h, v9.8h\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.16b, v15.16b, v9.16b\n"
+ "bge 28f\n"
+ "tbz x10, #3, 23f\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x10, #2, 21f\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "tbz x10, #1, 20f\n"
+ "st1 { v15.h }[6], [x9], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[14], [x9]\n"
+ "b 27f\n"
+ "20:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[12], [x9]\n"
+ "b 27f\n"
+ "21:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 22f\n"
+ "st1 { v15.h }[4], [x9], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[10], [x9]\n"
+ "b 27f\n"
+ "22:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[8], [x9]\n"
+ "b 27f\n"
+ "23:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 25f\n"
+ "str s15, [x9], #0x4\n"
+ "tbz x10, #1, 24f\n"
+ "st1 { v15.h }[2], [x9], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[6], [x9]\n"
+ "b 27f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[4], [x9]\n"
+ "b 27f\n"
+ "25:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 26f\n"
+ "str h15, [x9], #0x2\n"
+ "tbz x10, #0, 27f\n"
+ "st1 { v15.b }[2], [x9]\n"
+ "b 27f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_0
+ "str b15, [x9, #0x0]\n"
+ "27:" // Height 1: Partial direct writeback: Done
+ "b 29f\n"
+ "28:" // Height 1: Full writeback
+ "str q15, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "29:" // Height 1: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 2b\n"
+ "b 176f\n"
+ "30:" // Height 2
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "31:" // Height 2: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "32:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "33:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 34f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 35f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 35f\n"
+ "34:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "35:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "blt 38f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 37f\n"
+ "36:" // Height 2: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ "bge 36b\n"
+ "37:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "38:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 45f\n"
+ "cmp x26, #0x8\n"
+ "blt 40f\n"
+ "39:" // Height 2: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x0]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ "ldr q7, [x28, #0x10]\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ "bge 39b\n"
+ "cbz x26, 45f\n"
+ "40:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 42f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "tbz x26, #1, 41f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "tbz x26, #0, 44f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "b 44f\n"
+ "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 44f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "b 44f\n"
+ "42:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 43f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x26, #0, 44f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "b 44f\n"
+ "43:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "44:" // Height 2: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "45:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 33b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x9, x19\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "ldr q0, [x11, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "ldr q1, [x11, #0x10]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "ldr q2, [x11, #0x20]\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "mov v15.16b, v7.16b\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "tbz %x[flags], #4, 46f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 47f\n"
+ "46:" // Height 2: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "47:" // Height 2: parameters loaded
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "tbz %x[flags], #5, 48f\n"
+ "and v4.16b, v15.16b, v0.16b\n"
+ "and v5.16b, v12.16b, v1.16b\n"
+ "and v6.16b, v13.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "sqadd v12.4s, v12.4s, v5.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "and v7.16b, v14.16b, v3.16b\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v7.4s\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "48:" // Height 2: no shift correction
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "add x24, %x[qp], %[minval]\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "uzp1 v15.8h, v15.8h, v12.8h\n"
+ "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "uzp1 v15.16b, v15.16b, v12.16b\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "bge 57f\n"
+ "tbz x10, #3, 52f\n"
+ "str d15, [x9], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "tbz x10, #2, 50f\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v8.s }[2], [x23], #0x4\n"
+ "tbz x10, #1, 49f\n"
+ "st1 { v15.h }[6], [x9], #0x2\n"
+ "st1 { v8.h }[6], [x23], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[14], [x9]\n"
+ "st1 { v8.b }[14], [x23]\n"
+ "b 56f\n"
+ "49:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[12], [x9]\n"
+ "st1 { v8.b }[12], [x23]\n"
+ "b 56f\n"
+ "50:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 51f\n"
+ "st1 { v15.h }[4], [x9], #0x2\n"
+ "st1 { v8.h }[4], [x23], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[10], [x9]\n"
+ "st1 { v8.b }[10], [x23]\n"
+ "b 56f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[8], [x9]\n"
+ "st1 { v8.b }[8], [x23]\n"
+ "b 56f\n"
+ "52:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 54f\n"
+ "str s15, [x9], #0x4\n"
+ "str s8, [x23], #0x4\n"
+ "tbz x10, #1, 53f\n"
+ "st1 { v15.h }[2], [x9], #0x2\n"
+ "st1 { v8.h }[2], [x23], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[6], [x9]\n"
+ "st1 { v8.b }[6], [x23]\n"
+ "b 56f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[4], [x9]\n"
+ "st1 { v8.b }[4], [x23]\n"
+ "b 56f\n"
+ "54:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 55f\n"
+ "str h15, [x9], #0x2\n"
+ "str h8, [x23], #0x2\n"
+ "tbz x10, #0, 56f\n"
+ "st1 { v15.b }[2], [x9]\n"
+ "st1 { v8.b }[2], [x23]\n"
+ "b 56f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_0
+ "str b15, [x9, #0x0]\n"
+ "str b8, [x23, #0x0]\n"
+ "56:" // Height 2: Partial direct writeback: Done
+ "b 58f\n"
+ "57:" // Height 2: Full writeback
+ "str q15, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "str q8, [x23, #0x0]\n"
+ "58:" // Height 2: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 31b\n"
+ "b 176f\n"
+ "59:" // Height 3
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "60:" // Height 3: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "61:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "62:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 63f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 64f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 64f\n"
+ "63:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "64:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "blt 67f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 66f\n"
+ "65:" // Height 3: Multiply loop: Main loop head
+ "movi v4.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ "bge 65b\n"
+ "66:" // Height 3: Multiply loop: Single iteration only
+ "movi v4.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ "67:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 74f\n"
+ "cmp x26, #0x8\n"
+ "blt 69f\n"
+ "68:" // Height 3: Multiply loop: Odd block loop
+ "movi v4.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x0]\n"
+ "ldr q7, [x28, #0x10]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ "bge 68b\n"
+ "cbz x26, 74f\n"
+ "69:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 71f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "tbz x26, #1, 70f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "tbz x26, #0, 73f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "b 73f\n"
+ "70:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 73f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "b 73f\n"
+ "71:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 72f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x26, #0, 73f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "b 73f\n"
+ "72:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "73:" // Height 3: Multiply loop: Ragged operand read: Done
+ "movi v4.16b, #0x0\n"
+ "ldr q7, [x28, #0x0]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "74:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 62b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x9, x19\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "ldr q0, [x11, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "ldr q1, [x11, #0x10]\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "ldr q2, [x11, #0x20]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v7.16b\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "tbz %x[flags], #4, 75f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 76f\n"
+ "75:" // Height 3: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "76:" // Height 3: parameters loaded
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "tbz %x[flags], #5, 77f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v12.16b, v1.16b\n"
+ "and v6.16b, v13.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v12.4s, v12.4s, v5.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "and v7.16b, v14.16b, v3.16b\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v7.4s\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "77:" // Height 3: no shift correction
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "add x24, %x[qp], %[minval]\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "uzp1 v23.8h, v23.8h, v12.8h\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v12.16b\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 86f\n"
+ "tbz x10, #3, 81f\n"
+ "str d23, [x9], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x10, #2, 79f\n"
+ "st1 { v23.s }[2], [x9], #0x4\n"
+ "st1 { v8.s }[2], [x23], #0x4\n"
+ "st1 { v16.s }[2], [x22], #0x4\n"
+ "tbz x10, #1, 78f\n"
+ "st1 { v23.h }[6], [x9], #0x2\n"
+ "st1 { v8.h }[6], [x23], #0x2\n"
+ "st1 { v16.h }[6], [x22], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[14], [x9]\n"
+ "st1 { v8.b }[14], [x23]\n"
+ "st1 { v16.b }[14], [x22]\n"
+ "b 85f\n"
+ "78:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[12], [x9]\n"
+ "st1 { v8.b }[12], [x23]\n"
+ "st1 { v16.b }[12], [x22]\n"
+ "b 85f\n"
+ "79:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 80f\n"
+ "st1 { v23.h }[4], [x9], #0x2\n"
+ "st1 { v8.h }[4], [x23], #0x2\n"
+ "st1 { v16.h }[4], [x22], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[10], [x9]\n"
+ "st1 { v8.b }[10], [x23]\n"
+ "st1 { v16.b }[10], [x22]\n"
+ "b 85f\n"
+ "80:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[8], [x9]\n"
+ "st1 { v8.b }[8], [x23]\n"
+ "st1 { v16.b }[8], [x22]\n"
+ "b 85f\n"
+ "81:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 83f\n"
+ "str s23, [x9], #0x4\n"
+ "str s8, [x23], #0x4\n"
+ "str s16, [x22], #0x4\n"
+ "tbz x10, #1, 82f\n"
+ "st1 { v23.h }[2], [x9], #0x2\n"
+ "st1 { v8.h }[2], [x23], #0x2\n"
+ "st1 { v16.h }[2], [x22], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[6], [x9]\n"
+ "st1 { v8.b }[6], [x23]\n"
+ "st1 { v16.b }[6], [x22]\n"
+ "b 85f\n"
+ "82:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[4], [x9]\n"
+ "st1 { v8.b }[4], [x23]\n"
+ "st1 { v16.b }[4], [x22]\n"
+ "b 85f\n"
+ "83:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 84f\n"
+ "str h23, [x9], #0x2\n"
+ "str h8, [x23], #0x2\n"
+ "str h16, [x22], #0x2\n"
+ "tbz x10, #0, 85f\n"
+ "st1 { v23.b }[2], [x9]\n"
+ "st1 { v8.b }[2], [x23]\n"
+ "st1 { v16.b }[2], [x22]\n"
+ "b 85f\n"
+ "84:" // Height 3: Partial direct writeback: partial_1_0
+ "str b23, [x9, #0x0]\n"
+ "str b8, [x23, #0x0]\n"
+ "str b16, [x22, #0x0]\n"
+ "85:" // Height 3: Partial direct writeback: Done
+ "b 87f\n"
+ "86:" // Height 3: Full writeback
+ "str q23, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "str q8, [x23, #0x0]\n"
+ "str q16, [x22, #0x0]\n"
+ "87:" // Height 3: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 60b\n"
+ "b 176f\n"
+ "88:" // Height 4
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "89:" // Height 4: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "90:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "91:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 92f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 93f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 93f\n"
+ "92:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "93:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "blt 96f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 95f\n"
+ "94:" // Height 4: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ "bge 94b\n"
+ "95:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ "96:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 103f\n"
+ "cmp x26, #0x8\n"
+ "blt 98f\n"
+ "97:" // Height 4: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x0]\n"
+ "ldr q7, [x28, #0x10]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ "bge 97b\n"
+ "cbz x26, 103f\n"
+ "98:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 100f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "tbz x26, #1, 99f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "tbz x26, #0, 102f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "b 102f\n"
+ "99:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 102f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "b 102f\n"
+ "100:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 101f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x26, #0, 102f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 102f\n"
+ "101:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "102:" // Height 4: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "103:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 91b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x9, x19\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "ldr q0, [x11, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "ldr q1, [x11, #0x10]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "ldr q2, [x11, #0x20]\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v7.16b\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "tbz %x[flags], #4, 104f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 105f\n"
+ "104:" // Height 4: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "105:" // Height 4: parameters loaded
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v5.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "tbz %x[flags], #5, 106f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v12.16b, v1.16b\n"
+ "and v6.16b, v13.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v12.4s, v12.4s, v5.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "and v7.16b, v14.16b, v3.16b\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v7.4s\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v15.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "and v5.16b, v20.16b, v1.16b\n"
+ "and v6.16b, v21.16b, v2.16b\n"
+ "and v7.16b, v22.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "106:" // Height 4: no shift correction
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "add x24, %x[qp], %[minval]\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
+ "srshl v22.4s, v22.4s, v3.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "uzp1 v23.8h, v23.8h, v12.8h\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.8h, v15.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v12.16b\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 115f\n"
+ "tbz x10, #3, 110f\n"
+ "str d23, [x9], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "tbz x10, #2, 108f\n"
+ "st1 { v23.s }[2], [x9], #0x4\n"
+ "st1 { v8.s }[2], [x23], #0x4\n"
+ "st1 { v15.s }[2], [x22], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "tbz x10, #1, 107f\n"
+ "st1 { v23.h }[6], [x9], #0x2\n"
+ "st1 { v8.h }[6], [x23], #0x2\n"
+ "st1 { v15.h }[6], [x22], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[14], [x9]\n"
+ "st1 { v8.b }[14], [x23]\n"
+ "st1 { v15.b }[14], [x22]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "b 114f\n"
+ "107:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[12], [x9]\n"
+ "st1 { v8.b }[12], [x23]\n"
+ "st1 { v15.b }[12], [x22]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "b 114f\n"
+ "108:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 109f\n"
+ "st1 { v23.h }[4], [x9], #0x2\n"
+ "st1 { v8.h }[4], [x23], #0x2\n"
+ "st1 { v15.h }[4], [x22], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[10], [x9]\n"
+ "st1 { v8.b }[10], [x23]\n"
+ "st1 { v15.b }[10], [x22]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "b 114f\n"
+ "109:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[8], [x9]\n"
+ "st1 { v8.b }[8], [x23]\n"
+ "st1 { v15.b }[8], [x22]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "b 114f\n"
+ "110:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 112f\n"
+ "str s23, [x9], #0x4\n"
+ "str s8, [x23], #0x4\n"
+ "str s15, [x22], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "tbz x10, #1, 111f\n"
+ "st1 { v23.h }[2], [x9], #0x2\n"
+ "st1 { v8.h }[2], [x23], #0x2\n"
+ "st1 { v15.h }[2], [x22], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[6], [x9]\n"
+ "st1 { v8.b }[6], [x23]\n"
+ "st1 { v15.b }[6], [x22]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "b 114f\n"
+ "111:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[4], [x9]\n"
+ "st1 { v8.b }[4], [x23]\n"
+ "st1 { v15.b }[4], [x22]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "b 114f\n"
+ "112:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 113f\n"
+ "str h23, [x9], #0x2\n"
+ "str h8, [x23], #0x2\n"
+ "str h15, [x22], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "tbz x10, #0, 114f\n"
+ "st1 { v23.b }[2], [x9]\n"
+ "st1 { v8.b }[2], [x23]\n"
+ "st1 { v15.b }[2], [x22]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "b 114f\n"
+ "113:" // Height 4: Partial direct writeback: partial_1_0
+ "str b23, [x9, #0x0]\n"
+ "str b8, [x23, #0x0]\n"
+ "str b15, [x22, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "114:" // Height 4: Partial direct writeback: Done
+ "b 116f\n"
+ "115:" // Height 4: Full writeback
+ "str q23, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "str q8, [x23, #0x0]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q16, [x21, #0x0]\n"
+ "116:" // Height 4: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 89b\n"
+ "b 176f\n"
+ "117:" // Height 5
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "118:" // Height 5: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "119:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "120:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 122f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 122f\n"
+ "121:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "122:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "blt 125f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 124f\n"
+ "123:" // Height 5: Multiply loop: Main loop head
+ "movi v6.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "bge 123b\n"
+ "124:" // Height 5: Multiply loop: Single iteration only
+ "movi v6.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "125:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 132f\n"
+ "cmp x26, #0x8\n"
+ "blt 127f\n"
+ "126:" // Height 5: Multiply loop: Odd block loop
+ "movi v7.4s, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d5, [x21], #0x8\n"
+ "ldr q6, [x28, #0x0]\n"
+ "trn1 v4.2d, v5.2d, v7.2d\n"
+ "ldr q7, [x28, #0x10]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n"
+ "bge 126b\n"
+ "cbz x26, 132f\n"
+ "127:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 129f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x21], #0x4\n"
+ "tbz x26, #1, 128f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v5.h }[2], [x21], #0x2\n"
+ "tbz x26, #0, 131f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "ld1 { v5.b }[6], [x21]\n"
+ "b 131f\n"
+ "128:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 131f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "ld1 { v5.b }[4], [x21]\n"
+ "b 131f\n"
+ "129:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 130f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "tbz x26, #0, 131f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
+ "b 131f\n"
+ "130:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
+ "131:" // Height 5: Multiply loop: Ragged operand read: Done
+ "movi v6.4s, #0x0\n"
+ "ldr q7, [x28, #0x0]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "132:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 120b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x9, x19\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "ldr q0, [x11, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "ldr q1, [x11, #0x10]\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "ldr q2, [x11, #0x20]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v7.16b\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "tbz %x[flags], #4, 133f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 134f\n"
+ "133:" // Height 5: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "134:" // Height 5: parameters loaded
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v5.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "tbz %x[flags], #5, 135f\n"
+ "and v4.16b, v31.16b, v0.16b\n"
+ "and v5.16b, v12.16b, v1.16b\n"
+ "and v6.16b, v13.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
+ "sqadd v12.4s, v12.4s, v5.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "and v7.16b, v14.16b, v3.16b\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v7.4s\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v15.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "and v5.16b, v20.16b, v1.16b\n"
+ "and v6.16b, v21.16b, v2.16b\n"
+ "and v7.16b, v22.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "and v5.16b, v25.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v7.4s\n"
+ "135:" // Height 5: no shift correction
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "add x24, %x[qp], %[minval]\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
+ "srshl v22.4s, v22.4s, v3.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v31.8h, v31.8h, v12.8h\n"
+ "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v15.8h, v15.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v12.16b\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 144f\n"
+ "tbz x10, #3, 139f\n"
+ "str d31, [x9], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x10, #2, 137f\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "st1 { v8.s }[2], [x23], #0x4\n"
+ "st1 { v15.s }[2], [x22], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x10, #1, 136f\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
+ "st1 { v8.h }[6], [x23], #0x2\n"
+ "st1 { v15.h }[6], [x22], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[14], [x9]\n"
+ "st1 { v8.b }[14], [x23]\n"
+ "st1 { v15.b }[14], [x22]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 143f\n"
+ "136:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[12], [x9]\n"
+ "st1 { v8.b }[12], [x23]\n"
+ "st1 { v15.b }[12], [x22]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 143f\n"
+ "137:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 138f\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
+ "st1 { v8.h }[4], [x23], #0x2\n"
+ "st1 { v15.h }[4], [x22], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[10], [x9]\n"
+ "st1 { v8.b }[10], [x23]\n"
+ "st1 { v15.b }[10], [x22]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 143f\n"
+ "138:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[8], [x9]\n"
+ "st1 { v8.b }[8], [x23]\n"
+ "st1 { v15.b }[8], [x22]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 143f\n"
+ "139:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 141f\n"
+ "str s31, [x9], #0x4\n"
+ "str s8, [x23], #0x4\n"
+ "str s15, [x22], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x10, #1, 140f\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
+ "st1 { v8.h }[2], [x23], #0x2\n"
+ "st1 { v15.h }[2], [x22], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[6], [x9]\n"
+ "st1 { v8.b }[6], [x23]\n"
+ "st1 { v15.b }[6], [x22]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 143f\n"
+ "140:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[4], [x9]\n"
+ "st1 { v8.b }[4], [x23]\n"
+ "st1 { v15.b }[4], [x22]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 143f\n"
+ "141:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 142f\n"
+ "str h31, [x9], #0x2\n"
+ "str h8, [x23], #0x2\n"
+ "str h15, [x22], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x10, #0, 143f\n"
+ "st1 { v31.b }[2], [x9]\n"
+ "st1 { v8.b }[2], [x23]\n"
+ "st1 { v15.b }[2], [x22]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 143f\n"
+ "142:" // Height 5: Partial direct writeback: partial_1_0
+ "str b31, [x9, #0x0]\n"
+ "str b8, [x23, #0x0]\n"
+ "str b15, [x22, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "143:" // Height 5: Partial direct writeback: Done
+ "b 145f\n"
+ "144:" // Height 5: Full writeback
+ "str q31, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "str q8, [x23, #0x0]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "145:" // Height 5: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 118b\n"
+ "b 176f\n"
+ "146:" // Height 6
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x20, #0x6\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "147:" // Height 6: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "148:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "149:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 150f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 151f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 151f\n"
+ "150:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "151:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "blt 154f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 153f\n"
+ "152:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x20, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "bge 152b\n"
+ "153:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x20, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x28, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "154:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 161f\n"
+ "cmp x26, #0x8\n"
+ "blt 156f\n"
+ "155:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d5, [x21], #0x8\n"
+ "ldr d7, [x20], #0x8\n"
+ "trn1 v4.2d, v5.2d, v7.2d\n"
+ "ldr q6, [x28, #0x0]\n"
+ "ldr q7, [x28, #0x10]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n"
+ "bge 155b\n"
+ "cbz x26, 161f\n"
+ "156:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 158f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x21], #0x4\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz x26, #1, 157f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v5.h }[2], [x21], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x26, #0, 160f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "ld1 { v5.b }[6], [x21]\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 160f\n"
+ "157:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 160f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "ld1 { v5.b }[4], [x21]\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 160f\n"
+ "158:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 159f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "ldr h6, [x20], #0x2\n"
+ "tbz x26, #0, 160f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 160f\n"
+ "159:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
+ "ldr b6, [x20, #0x0]\n"
+ "160:" // Height 6: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x28, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "161:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 149b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "add x23, x9, x19\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "ldr q0, [x11, #0x0]\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "add x22, x23, x19\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "ldr q1, [x11, #0x10]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "ldr q2, [x11, #0x20]\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "ldr q3, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v7.16b\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v12.4s, v12.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v2.4s\n"
+ "add v14.4s, v14.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v15.4s, v15.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v1.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "tbz %x[flags], #4, 162f\n"
+ "ldr q0, [x12, #0x0]\n"
+ "ldr q4, [x13, #0x0]\n"
+ "ldr q1, [x12, #0x10]\n"
+ "ldr q5, [x13, #0x10]\n"
+ "ldr q2, [x12, #0x20]\n"
+ "ldr q6, [x13, #0x20]\n"
+ "ldr q3, [x12, #0x30]\n"
+ "add x12, x12, #0x40\n"
+ "ldr q7, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "b 163f\n"
+ "162:" // Height 6: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x24]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "163:" // Height 6: parameters loaded
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v5.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v6.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v7.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v5.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v6.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v5.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v6.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "tbz %x[flags], #5, 164f\n"
+ "and v4.16b, v31.16b, v0.16b\n"
+ "and v5.16b, v12.16b, v1.16b\n"
+ "and v6.16b, v13.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
+ "sqadd v12.4s, v12.4s, v5.4s\n"
+ "sqadd v13.4s, v13.4s, v6.4s\n"
+ "and v7.16b, v14.16b, v3.16b\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v7.4s\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "and v4.16b, v15.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "and v5.16b, v20.16b, v1.16b\n"
+ "and v6.16b, v21.16b, v2.16b\n"
+ "and v7.16b, v22.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v28.16b, v1.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v5.4s\n"
+ "and v6.16b, v29.16b, v2.16b\n"
+ "and v7.16b, v30.16b, v3.16b\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "sqadd v30.4s, v30.4s, v7.4s\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "and v5.16b, v25.16b, v1.16b\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v7.4s\n"
+ "164:" // Height 6: no shift correction
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x24]\n"
+ "srshl v12.4s, v12.4s, v1.4s\n"
+ "add x24, %x[qp], %[minval]\n"
+ "srshl v13.4s, v13.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ "srshl v14.4s, v14.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x24]\n"
+ "cmp x10, #0x10\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
+ "srshl v22.4s, v22.4s, v3.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v29.4s, v29.4s, v2.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v31.8h, v31.8h, v12.8h\n"
+ "uzp1 v12.8h, v13.8h, v14.8h\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v15.8h, v15.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.8h, v23.8h, v28.8h\n"
+ "uzp1 v28.8h, v29.8h, v30.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v12.16b\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v15.16b, v15.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v23.16b, v23.16b, v28.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 173f\n"
+ "tbz x10, #3, 168f\n"
+ "str d31, [x9], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d23, [x20], #0x8\n"
+ "str d24, [x19], #0x8\n"
+ "tbz x10, #2, 166f\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "st1 { v8.s }[2], [x23], #0x4\n"
+ "st1 { v15.s }[2], [x22], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "st1 { v23.s }[2], [x20], #0x4\n"
+ "st1 { v24.s }[2], [x19], #0x4\n"
+ "tbz x10, #1, 165f\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
+ "st1 { v8.h }[6], [x23], #0x2\n"
+ "st1 { v15.h }[6], [x22], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "st1 { v23.h }[6], [x20], #0x2\n"
+ "st1 { v24.h }[6], [x19], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[14], [x9]\n"
+ "st1 { v8.b }[14], [x23]\n"
+ "st1 { v15.b }[14], [x22]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "st1 { v23.b }[14], [x20]\n"
+ "st1 { v24.b }[14], [x19]\n"
+ "b 172f\n"
+ "165:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[12], [x9]\n"
+ "st1 { v8.b }[12], [x23]\n"
+ "st1 { v15.b }[12], [x22]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "st1 { v23.b }[12], [x20]\n"
+ "st1 { v24.b }[12], [x19]\n"
+ "b 172f\n"
+ "166:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 167f\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
+ "st1 { v8.h }[4], [x23], #0x2\n"
+ "st1 { v15.h }[4], [x22], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "st1 { v23.h }[4], [x20], #0x2\n"
+ "st1 { v24.h }[4], [x19], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[10], [x9]\n"
+ "st1 { v8.b }[10], [x23]\n"
+ "st1 { v15.b }[10], [x22]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "st1 { v23.b }[10], [x20]\n"
+ "st1 { v24.b }[10], [x19]\n"
+ "b 172f\n"
+ "167:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[8], [x9]\n"
+ "st1 { v8.b }[8], [x23]\n"
+ "st1 { v15.b }[8], [x22]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "st1 { v23.b }[8], [x20]\n"
+ "st1 { v24.b }[8], [x19]\n"
+ "b 172f\n"
+ "168:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 170f\n"
+ "str s31, [x9], #0x4\n"
+ "str s8, [x23], #0x4\n"
+ "str s15, [x22], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "str s23, [x20], #0x4\n"
+ "str s24, [x19], #0x4\n"
+ "tbz x10, #1, 169f\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
+ "st1 { v8.h }[2], [x23], #0x2\n"
+ "st1 { v15.h }[2], [x22], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "st1 { v23.h }[2], [x20], #0x2\n"
+ "st1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[6], [x9]\n"
+ "st1 { v8.b }[6], [x23]\n"
+ "st1 { v15.b }[6], [x22]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "st1 { v23.b }[6], [x20]\n"
+ "st1 { v24.b }[6], [x19]\n"
+ "b 172f\n"
+ "169:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[4], [x9]\n"
+ "st1 { v8.b }[4], [x23]\n"
+ "st1 { v15.b }[4], [x22]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "st1 { v23.b }[4], [x20]\n"
+ "st1 { v24.b }[4], [x19]\n"
+ "b 172f\n"
+ "170:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 171f\n"
+ "str h31, [x9], #0x2\n"
+ "str h8, [x23], #0x2\n"
+ "str h15, [x22], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "str h23, [x20], #0x2\n"
+ "str h24, [x19], #0x2\n"
+ "tbz x10, #0, 172f\n"
+ "st1 { v31.b }[2], [x9]\n"
+ "st1 { v8.b }[2], [x23]\n"
+ "st1 { v15.b }[2], [x22]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "st1 { v24.b }[2], [x19]\n"
+ "b 172f\n"
+ "171:" // Height 6: Partial direct writeback: partial_1_0
+ "str b31, [x9, #0x0]\n"
+ "str b8, [x23, #0x0]\n"
+ "str b15, [x22, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "str b23, [x20, #0x0]\n"
+ "str b24, [x19, #0x0]\n"
+ "172:" // Height 6: Partial direct writeback: Done
+ "b 174f\n"
+ "173:" // Height 6: Full writeback
+ "str q31, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "str q8, [x23, #0x0]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q23, [x20, #0x0]\n"
+ "str q24, [x19, #0x0]\n"
+ "174:" // Height 6: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 147b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 176f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 175f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "175:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "176:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
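For reference, the tail of each height block above implements the standard Requantize32 pipeline: add column bias, sqrdmulh by the per-channel (or per-layer) multiplier, rounding right shift, add the output offset, clamp, and narrow to 8 bits. A minimal scalar sketch of that sequence follows; it assumes a non-negative right_shift (the kernel stores it as a negative srshl amount), ignores the sqrdmulh saturation corner case, and requantize_one plus its parameter names are illustrative, not library API.

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the sqrdmulh/srshl/add/clamp sequence in the kernel above.
    static inline int8_t requantize_one(int32_t acc, int32_t bias, int32_t mul,
                                        int32_t right_shift, int32_t c_offset,
                                        int32_t minval, int32_t maxval) {
        int64_t v = (int64_t)acc + bias;            // "add v.4s, v.4s, bias"
        v = (v * mul + (1LL << 30)) >> 31;          // sqrdmulh: doubling high-half multiply, rounded
        if (right_shift > 0) {                      // srshl by -right_shift:
            v = (v + (1LL << (right_shift - 1))) >> right_shift; // rounding arithmetic shift
        }
        v += c_offset;                              // output zero point
        v = std::min<int64_t>(std::max<int64_t>(v, minval), maxval); // smin/smax clamp
        return (int8_t)v;                           // uzp1 narrowing keeps the low byte
    }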
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
index 759a78a413..d91c69b8a0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_s8s32_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_s8s32_dot_6x16
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,35 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 12.667, 2.0799, 0.2279 };
- default:
- return { 29.6736, 11.4025, 0.5591 };
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.65 };
+ case CPUModel::A510:
+ return { 15.87 };
+ case CPUModel::V1:
+ return { 54.50 };
+ }
}
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 9.5238, 2.0799, 0.2279 };
+ default:
+ return { 29.6736, 11.4025, 0.5591 };
+ case CPUModel::A510:
+ return { 16.66, 3.92, 0.48 };
+ case CPUModel::V1:
+ return { 55.40, 19.21, 0.93 };
+ }
+ }
+
+ return { 1.0 };
}
// Default to the generic kernel
@@ -99,4 +119,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
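The reworked get_performance_parameters is now templated on the accumulation type, so one class can report separate estimates for the widened int32 output path and the quantized int8 path. A sketch of the intended call pattern follows; ci is supplied by the surrounding framework, and the meaning of the one-value versus three-value initializers (a MACs-per-cycle figure, optionally with prepare/merge bandwidth terms) is our reading, not documented here.

    // Hypothetical caller: select estimates per output type for this kernel.
    PerformanceParameters p_s32 =
        cls_a64_hybrid_s8s32_dot_6x16::get_performance_parameters<int32_t>(ci);
    PerformanceParameters p_s8 =
        cls_a64_hybrid_s8s32_dot_6x16::get_performance_parameters<int8_t>(ci);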
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
index 3566027a50..e47295a766 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -1819,8 +1819,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"ld1 { v22.4s }, [x21], #0x10\n"
"ld1 { v26.4s }, [x20], #0x10\n"
"tbz x10, #1, 139f\n"
- "mov x24, #0x38\n"
"ldr d11, [x28], #0x8\n"
+ "mov x24, #0x38\n"
"ldr d15, [x23], #0x8\n"
"ldr d19, [x22], #0x8\n"
"ldr d23, [x21], #0x8\n"
@@ -1873,8 +1873,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"ld1 { v20.4s }, [x21], #0x10\n"
"ld1 { v24.4s }, [x20], #0x10\n"
"tbz x10, #1, 143f\n"
- "mov x24, #0x18\n"
"ldr d9, [x28], #0x8\n"
+ "mov x24, #0x18\n"
"ldr d13, [x23], #0x8\n"
"ldr d17, [x22], #0x8\n"
"ldr d21, [x21], #0x8\n"
@@ -2487,12 +2487,12 @@ void a64_hybrid_s8s32_dot_6x16 (
"ld1 { v16.4s }, [x22], #0x10\n"
"ld1 { v20.4s }, [x21], #0x10\n"
"ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
"ld1 { v9.4s }, [x28], #0x10\n"
"ld1 { v13.4s }, [x23], #0x10\n"
"ld1 { v17.4s }, [x22], #0x10\n"
"ld1 { v21.4s }, [x21], #0x10\n"
"ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
"ld1 { v29.4s }, [x19], #0x10\n"
"tbz x10, #2, 174f\n"
"ld1 { v10.4s }, [x28], #0x10\n"
@@ -2502,8 +2502,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"ld1 { v26.4s }, [x20], #0x10\n"
"ld1 { v30.4s }, [x19], #0x10\n"
"tbz x10, #1, 173f\n"
- "mov x24, #0x38\n"
"ldr d11, [x28], #0x8\n"
+ "mov x24, #0x38\n"
"ldr d15, [x23], #0x8\n"
"ldr d19, [x22], #0x8\n"
"ldr d23, [x21], #0x8\n"
@@ -2563,8 +2563,8 @@ void a64_hybrid_s8s32_dot_6x16 (
"ld1 { v24.4s }, [x20], #0x10\n"
"ld1 { v28.4s }, [x19], #0x10\n"
"tbz x10, #1, 177f\n"
- "mov x24, #0x18\n"
"ldr d9, [x28], #0x8\n"
+ "mov x24, #0x18\n"
"ldr d13, [x23], #0x8\n"
"ldr d17, [x22], #0x8\n"
"ldr d21, [x21], #0x8\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
new file mode 100644
index 0000000000..50ccb6fa3d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16.hpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int32_t>, \
+ const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_s8s32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8s32_mmla_6x16
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
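+
+    // Per-CPU throughput estimates feeding the kernel selection heuristics;
+    // the three-number int8_t case also carries pack/merge cost terms used on
+    // the quantized path.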
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.98 };
+ case CPUModel::A510:
+ return { 30.30 };
+ case CPUModel::V1:
+ return { 83.71 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 55.27, 15.25, 0.62 };
+ case CPUModel::A510:
+ return { 33.62, 3.92, 0.48 };
+ case CPUModel::V1:
+ return { 86.36, 19.25, 0.92 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8s32_mmla_6x16;
+ cls_a64_hybrid_s8s32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..a9f6b06ae1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp
@@ -0,0 +1,3463 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
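+    // All arguments the assembly needs are marshalled into one struct and
+    // addressed from the asm block as [args_ptr, #offsetof_...].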
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
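+    // flags: bit 0 = accumulate, bit 2 = indirect output, bit 3 = indirect
+    // input; the assembly tests these with tbz on %x[flags].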
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 186f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 149f\n"
+ "beq 112f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 75f\n"
+ "beq 38f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x10, #0x10\n"
+ "bge 11f\n"
+ "tbz x10, #3, 6f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "tbz x10, #2, 4f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 3f\n"
+ "mov x24, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "b 10f\n"
+ "3:" // Height 1: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "b 10f\n"
+ "4:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x10, #1, 5f\n"
+ "ldr d11, [x28], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "b 10f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "b 10f\n"
+ "6:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x10, #2, 8f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 7f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x24, #0x18\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "b 10f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "b 10f\n"
+ "8:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x10, #1, 9f\n"
+ "ldr d9, [x28], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "b 10f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "10:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 12f\n"
+ "11:" // Height 1: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "12:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "blt 20f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 27f\n"
+ "cmp x26, #0x8\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "movi v2.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ "ldr q7, [x9, #0x10]\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ "bge 21b\n"
+ "cbz x26, 27f\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 24f\n"
+ "ldr s1, [x25], #0x4\n"
+ "tbz x26, #1, 23f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "tbz x26, #0, 26f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "b 26f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 26f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "b 26f\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 25f\n"
+ "ldr h1, [x25], #0x2\n"
+ "tbz x26, #0, 26f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "b 26f\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "26:" // Height 1: Multiply loop: Ragged operand read: Done
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "27:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 15b\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "bge 36f\n"
+ "tbz x10, #3, 31f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "tbz x10, #2, 29f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 28f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x10, #0, 35f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "b 35f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 35f\n"
+ "str s11, [x28, #0x0]\n"
+ "b 35f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 30f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x10, #0, 35f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "b 35f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 35f\n"
+ "str s10, [x28, #0x0]\n"
+ "b 35f\n"
+ "31:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 33f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 32f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x10, #0, 35f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "b 35f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 35f\n"
+ "str s9, [x28, #0x0]\n"
+ "b 35f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 34f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x10, #0, 35f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "b 35f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "35:" // Height 1: Partial direct writeback: Done
+ "b 37f\n"
+ "36:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "37:" // Height 1: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 2b\n"
+ "b 224f\n"
+ "38:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "39:" // Height 2: Column loop
+ "tbz %x[flags], #0, 50f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "bge 48f\n"
+ "tbz x10, #3, 43f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "tbz x10, #2, 41f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 40f\n"
+ "mov x24, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "tbz x10, #0, 47f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 47f\n"
+ "40:" // Height 2: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 47f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 47f\n"
+ "41:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x10, #1, 42f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x10, #0, 47f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 47f\n"
+ "42:" // Height 2: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 47f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "b 47f\n"
+ "43:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x10, #2, 45f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 44f\n"
+ "mov x24, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "tbz x10, #0, 47f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "b 47f\n"
+ "44:" // Height 2: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 47f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "b 47f\n"
+ "45:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x10, #1, 46f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x10, #0, 47f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "b 47f\n"
+ "46:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "47:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 49f\n"
+ "48:" // Height 2: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "49:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 51f\n"
+ "50:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "51:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "52:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 53f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 54f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 54f\n"
+ "53:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "54:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "blt 57f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 56f\n"
+ "55:" // Height 2: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ "bge 55b\n"
+ "56:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "57:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 64f\n"
+ "cmp x26, #0x8\n"
+ "blt 59f\n"
+ "58:" // Height 2: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ "ldr q7, [x9, #0x10]\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ "bge 58b\n"
+ "cbz x26, 64f\n"
+ "59:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 61f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "tbz x26, #1, 60f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "tbz x26, #0, 63f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "b 63f\n"
+ "60:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 63f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "b 63f\n"
+ "61:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 62f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x26, #0, 63f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "b 63f\n"
+ "62:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "63:" // Height 2: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ "64:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 52b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "bge 73f\n"
+ "tbz x10, #3, 68f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "tbz x10, #2, 66f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 65f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "tbz x10, #0, 72f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "b 72f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 72f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "b 72f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 67f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "tbz x10, #0, 72f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "b 72f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 72f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "b 72f\n"
+ "68:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 70f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 69f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "tbz x10, #0, 72f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "b 72f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 72f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "b 72f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 71f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "tbz x10, #0, 72f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "b 72f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "72:" // Height 2: Partial direct writeback: Done
+ "b 74f\n"
+ "73:" // Height 2: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "74:" // Height 2: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 39b\n"
+ "b 224f\n"
+ "75:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "76:" // Height 3: Column loop
+ "tbz %x[flags], #0, 87f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 85f\n"
+ "tbz x10, #3, 80f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "tbz x10, #2, 78f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v19.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 77f\n"
+ "mov x24, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x10, #0, 84f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 84f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 84f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "b 84f\n"
+ "78:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x10, #1, 79f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x10, #0, 84f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "b 84f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 84f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "b 84f\n"
+ "80:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x10, #2, 82f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 81f\n"
+ "mov x24, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "tbz x10, #0, 84f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "b 84f\n"
+ "81:" // Height 3: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 84f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "b 84f\n"
+ "82:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x10, #1, 83f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "tbz x10, #0, 84f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "b 84f\n"
+ "83:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "84:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 86f\n"
+ "85:" // Height 3: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q17, [x22, #0x0]\n"
+ "ldr q18, [x22, #0x10]\n"
+ "ldr q19, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "86:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 88f\n"
+ "87:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "88:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "89:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 90f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 91f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 91f\n"
+ "90:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "91:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "blt 94f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 93f\n"
+ "92:" // Height 3: Multiply loop: Main loop head
+ "movi v4.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ "bge 92b\n"
+ "93:" // Height 3: Multiply loop: Single iteration only
+ "movi v4.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ "94:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 101f\n"
+ "cmp x26, #0x8\n"
+ "blt 96f\n"
+ "95:" // Height 3: Multiply loop: Odd block loop
+ "movi v4.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ "bge 95b\n"
+ "cbz x26, 101f\n"
+ "96:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 98f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "tbz x26, #1, 97f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "tbz x26, #0, 100f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "b 100f\n"
+ "97:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 100f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "b 100f\n"
+ "98:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 99f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x26, #0, 100f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "b 100f\n"
+ "99:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "100:" // Height 3: Multiply loop: Ragged operand read: Done
+ "movi v4.16b, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "101:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 89b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "bge 110f\n"
+ "tbz x10, #3, 105f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x10, #2, 103f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 102f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x10, #0, 109f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 109f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 109f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 109f\n"
+ "103:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 104f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x10, #0, 109f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 109f\n"
+ "104:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 109f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 109f\n"
+ "105:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 107f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 106f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x10, #0, 109f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 109f\n"
+ "106:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 109f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 109f\n"
+ "107:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 108f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x10, #0, 109f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 109f\n"
+ "108:" // Height 3: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "109:" // Height 3: Partial direct writeback: Done
+ "b 111f\n"
+ "110:" // Height 3: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "111:" // Height 3: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 76b\n"
+ "b 224f\n"
+ "112:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "113:" // Height 4: Column loop
+ "tbz %x[flags], #0, 124f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 122f\n"
+ "tbz x10, #3, 117f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "tbz x10, #2, 115f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v19.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 114f\n"
+ "mov x24, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "tbz x10, #0, 121f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "b 121f\n"
+ "114:" // Height 4: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 121f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "b 121f\n"
+ "115:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x10, #1, 116f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz x10, #0, 121f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "b 121f\n"
+ "116:" // Height 4: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 121f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "b 121f\n"
+ "117:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x10, #2, 119f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 118f\n"
+ "mov x24, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "tbz x10, #0, 121f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "b 121f\n"
+ "118:" // Height 4: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 121f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "b 121f\n"
+ "119:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x10, #1, 120f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "tbz x10, #0, 121f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "b 121f\n"
+ "120:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "121:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 123f\n"
+ "122:" // Height 4: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q17, [x22, #0x0]\n"
+ "ldr q18, [x22, #0x10]\n"
+ "ldr q19, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "123:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 125f\n"
+ "124:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "125:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "126:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 127f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 128f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 128f\n"
+ "127:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "128:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "blt 131f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 130f\n"
+ "129:" // Height 4: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ "bge 129b\n"
+ "130:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ "131:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 138f\n"
+ "cmp x26, #0x8\n"
+ "blt 133f\n"
+ "132:" // Height 4: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ "bge 132b\n"
+ "cbz x26, 138f\n"
+ "133:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 135f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "tbz x26, #1, 134f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "tbz x26, #0, 137f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "b 137f\n"
+ "134:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 137f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "b 137f\n"
+ "135:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 136f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x26, #0, 137f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 137f\n"
+ "136:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "137:" // Height 4: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "138:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 126b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "bge 147f\n"
+ "tbz x10, #3, 142f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v17.4s }, [x21], #0x10\n"
+ "tbz x10, #2, 140f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v18.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 139f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x21]\n"
+ "b 146f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 146f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s19, [x21, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 141f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d18, [x21], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "b 146f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 146f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s18, [x21, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 144f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 143f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d17, [x21], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v17.s }[2], [x21]\n"
+ "b 146f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 146f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s17, [x21, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 145f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "b 146f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "str s16, [x21, #0x0]\n"
+ "146:" // Height 4: Partial direct writeback: Done
+ "b 148f\n"
+ "147:" // Height 4: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q20, [x22, #0x10]\n"
+ "str q21, [x22, #0x20]\n"
+ "str q22, [x22, #0x30]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q18, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "148:" // Height 4: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 113b\n"
+ "b 224f\n"
+ "149:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "150:" // Height 5: Column loop
+ "tbz %x[flags], #0, 161f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 159f\n"
+ "tbz x10, #3, 154f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "tbz x10, #2, 152f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v19.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 151f\n"
+ "ldr d16, [x28], #0x8\n"
+ "mov x24, #0x38\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz x10, #0, 158f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 158f\n"
+ "151:" // Height 5: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 158f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s6, [x20, #0x0]\n"
+ "b 158f\n"
+ "152:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x10, #1, 153f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "tbz x10, #0, 158f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "b 158f\n"
+ "153:" // Height 5: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 158f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "b 158f\n"
+ "154:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x10, #2, 156f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 155f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "tbz x10, #0, 158f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "b 158f\n"
+ "155:" // Height 5: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 158f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "b 158f\n"
+ "156:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x10, #1, 157f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "tbz x10, #0, 158f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "b 158f\n"
+ "157:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "158:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 160f\n"
+ "159:" // Height 5: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q17, [x22, #0x0]\n"
+ "ldr q18, [x22, #0x10]\n"
+ "ldr q19, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q25, [x20, #0x0]\n"
+ "ldr q26, [x20, #0x10]\n"
+ "ldr q27, [x20, #0x20]\n"
+ "ldr q6, [x20, #0x30]\n"
+ "160:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 162f\n"
+ "161:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "162:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "163:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 164f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 165f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 165f\n"
+ "164:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "165:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "blt 168f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 167f\n"
+ "166:" // Height 5: Multiply loop: Main loop head
+ "movi v6.4s, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "bge 166b\n"
+ "167:" // Height 5: Multiply loop: Single iteration only
+ "movi v6.4s, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "168:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 175f\n"
+ "cmp x26, #0x8\n"
+ "blt 170f\n"
+ "169:" // Height 5: Multiply loop: Odd block loop
+ "movi v7.4s, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d5, [x21], #0x8\n"
+ "ldr q6, [x9, #0x0]\n"
+ "trn1 v4.2d, v5.2d, v7.2d\n"
+ "ldr q7, [x9, #0x10]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n"
+ "bge 169b\n"
+ "cbz x26, 175f\n"
+ "170:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 172f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x21], #0x4\n"
+ "tbz x26, #1, 171f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v5.h }[2], [x21], #0x2\n"
+ "tbz x26, #0, 174f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "ld1 { v5.b }[6], [x21]\n"
+ "b 174f\n"
+ "171:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 174f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "ld1 { v5.b }[4], [x21]\n"
+ "b 174f\n"
+ "172:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 173f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "tbz x26, #0, 174f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
+ "b 174f\n"
+ "173:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
+ "174:" // Height 5: Multiply loop: Ragged operand read: Done
+ "movi v6.4s, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "175:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 163b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "bge 184f\n"
+ "tbz x10, #3, 179f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v17.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x10, #2, 177f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v18.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 176f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x10, #0, 183f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 183f\n"
+ "176:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 183f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s19, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 183f\n"
+ "177:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 178f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d18, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x10, #0, 183f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 183f\n"
+ "178:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 183f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s18, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 183f\n"
+ "179:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 181f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 180f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d17, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x10, #0, 183f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v17.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 183f\n"
+ "180:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 183f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s17, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 183f\n"
+ "181:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 182f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x10, #0, 183f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 183f\n"
+ "182:" // Height 5: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "str s16, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "183:" // Height 5: Partial direct writeback: Done
+ "b 185f\n"
+ "184:" // Height 5: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q20, [x22, #0x10]\n"
+ "str q21, [x22, #0x20]\n"
+ "str q22, [x22, #0x30]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q18, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "185:" // Height 5: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 150b\n"
+ "b 224f\n"
+ "186:" // Height 6
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "187:" // Height 6: Column loop
+ "tbz %x[flags], #0, 198f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "add x19, x20, x19, LSL #2\n"
+ "bge 196f\n"
+ "tbz x10, #3, 191f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "ld1 { v29.4s }, [x19], #0x10\n"
+ "tbz x10, #2, 189f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v19.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x20], #0x10\n"
+ "ld1 { v30.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 188f\n"
+ "ldr d16, [x28], #0x8\n"
+ "mov x24, #0x38\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
+ "ldr d31, [x19], #0x8\n"
+ "tbz x10, #0, 195f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "ld1 { v31.s }[2], [x19]\n"
+ "b 195f\n"
+ "188:" // Height 6: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 195f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s6, [x20, #0x0]\n"
+ "ldr s31, [x19, #0x0]\n"
+ "b 195f\n"
+ "189:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x10, #1, 190f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "ldr d30, [x19], #0x8\n"
+ "tbz x10, #0, 195f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "ld1 { v30.s }[2], [x19]\n"
+ "b 195f\n"
+ "190:" // Height 6: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 195f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "ldr s30, [x19, #0x0]\n"
+ "b 195f\n"
+ "191:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x10, #2, 193f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 192f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "ldr d29, [x19], #0x8\n"
+ "tbz x10, #0, 195f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "ld1 { v29.s }[2], [x19]\n"
+ "b 195f\n"
+ "192:" // Height 6: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 195f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "ldr s29, [x19, #0x0]\n"
+ "b 195f\n"
+ "193:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x10, #1, 194f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "ldr d28, [x19], #0x8\n"
+ "tbz x10, #0, 195f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "ld1 { v28.s }[2], [x19]\n"
+ "b 195f\n"
+ "194:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "ldr s28, [x19, #0x0]\n"
+ "195:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 197f\n"
+ "196:" // Height 6: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q17, [x22, #0x0]\n"
+ "ldr q18, [x22, #0x10]\n"
+ "ldr q19, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q25, [x20, #0x0]\n"
+ "ldr q26, [x20, #0x10]\n"
+ "ldr q27, [x20, #0x20]\n"
+ "ldr q6, [x20, #0x30]\n"
+ "ldr q28, [x19, #0x0]\n"
+ "ldr q29, [x19, #0x10]\n"
+ "ldr q30, [x19, #0x20]\n"
+ "ldr q31, [x19, #0x30]\n"
+ "197:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 199f\n"
+ "198:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "199:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "200:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 201f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 202f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 202f\n"
+ "201:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "202:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "blt 205f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 204f\n"
+ "203:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x20, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "bge 203b\n"
+ "204:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x20, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x4e87a428 // smmla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a470 // smmla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b8 // smmla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x4e86a42c // smmla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a474 // smmla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bc // smmla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x4e87a429 // smmla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a471 // smmla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4b9 // smmla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x4e86a42d // smmla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a475 // smmla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bd // smmla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x4e87a42a // smmla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4ba // smmla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a476 // smmla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4be // smmla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x4e87a42b // smmla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e87a473 // smmla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e87a4bb // smmla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a477 // smmla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e86a4bf // smmla v31.4s, v5.16b, v6.16b\n"
+ "205:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 212f\n"
+ "cmp x26, #0x8\n"
+ "blt 207f\n"
+ "206:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d5, [x21], #0x8\n"
+ "ldr d7, [x20], #0x8\n"
+ "trn1 v4.2d, v5.2d, v7.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a450 // smmla v16.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a498 // smmla v24.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a454 // smmla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49c // smmla v28.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a451 // smmla v17.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a499 // smmla v25.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49d // smmla v29.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a452 // smmla v18.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49a // smmla v26.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x4e87a40e // smmla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a456 // smmla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49e // smmla v30.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e86a40b // smmla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a453 // smmla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49b // smmla v27.4s, v4.16b, v6.16b\n"
+ ".inst 0x4e87a40f // smmla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49f // smmla v31.4s, v4.16b, v7.16b\n"
+ "bge 206b\n"
+ "cbz x26, 212f\n"
+ "207:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 209f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x21], #0x4\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz x26, #1, 208f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v5.h }[2], [x21], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x26, #0, 211f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "ld1 { v5.b }[6], [x21]\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 211f\n"
+ "208:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 211f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "ld1 { v5.b }[4], [x21]\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 211f\n"
+ "209:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 210f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "ldr h6, [x20], #0x2\n"
+ "tbz x26, #0, 211f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 211f\n"
+ "210:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
+ "ldr b6, [x20, #0x0]\n"
+ "211:" // Height 6: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x4e87a408 // smmla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a450 // smmla v16.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a498 // smmla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49c // smmla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a451 // smmla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a499 // smmla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x4e86a40d // smmla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49d // smmla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x4e87a40a // smmla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a452 // smmla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49a // smmla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x4e86a40e // smmla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49e // smmla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a453 // smmla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a49b // smmla v27.4s, v4.16b, v7.16b\n"
+ ".inst 0x4e86a40f // smmla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a457 // smmla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a49f // smmla v31.4s, v4.16b, v6.16b\n"
+ "212:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 200b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "bge 221f\n"
+ "tbz x10, #3, 216f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v17.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v24.4s }, [x19], #0x10\n"
+ "st1 { v25.4s }, [x19], #0x10\n"
+ "tbz x10, #2, 214f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v18.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "st1 { v26.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 213f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "str d27, [x19], #0x8\n"
+ "tbz x10, #0, 220f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "st1 { v27.s }[2], [x19]\n"
+ "b 220f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 220f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s19, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
+ "str s27, [x19, #0x0]\n"
+ "b 220f\n"
+ "214:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 215f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d18, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "str d26, [x19], #0x8\n"
+ "tbz x10, #0, 220f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
+ "st1 { v26.s }[2], [x19]\n"
+ "b 220f\n"
+ "215:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 220f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s18, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
+ "str s26, [x19, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 218f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x20], #0x10\n"
+ "st1 { v24.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 217f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d17, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "str d25, [x19], #0x8\n"
+ "tbz x10, #0, 220f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v17.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
+ "st1 { v25.s }[2], [x19]\n"
+ "b 220f\n"
+ "217:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 220f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s17, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
+ "str s25, [x19, #0x0]\n"
+ "b 220f\n"
+ "218:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 219f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d23, [x20], #0x8\n"
+ "str d24, [x19], #0x8\n"
+ "tbz x10, #0, 220f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "st1 { v24.s }[2], [x19]\n"
+ "b 220f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "str s16, [x21, #0x0]\n"
+ "str s23, [x20, #0x0]\n"
+ "str s24, [x19, #0x0]\n"
+ "220:" // Height 6: Partial direct writeback: Done
+ "b 222f\n"
+ "221:" // Height 6: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q20, [x22, #0x10]\n"
+ "str q21, [x22, #0x20]\n"
+ "str q22, [x22, #0x30]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q18, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q23, [x20, #0x0]\n"
+ "str q28, [x20, #0x10]\n"
+ "str q29, [x20, #0x20]\n"
+ "str q30, [x20, #0x30]\n"
+ "str q24, [x19, #0x0]\n"
+ "str q25, [x19, #0x10]\n"
+ "str q26, [x19, #0x20]\n"
+ "str q27, [x19, #0x30]\n"
+ "222:" // Height 6: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 187b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 224f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 223f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "223:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "224:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
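[Editor's note on the writeback pattern above: each SMMLA accumulator holds a 2x2 tile of int32 results spanning two output rows, so the epilogue first regroups tile pairs into whole rows with uzp1/uzp2 on .2d lanes before the str/st1 stores; uzp1 picks the even output row of each tile pair and uzp2 the odd one. A minimal scalar model of a single SMMLA step follows; the names (acc, a, b) are illustrative only, and the unsigned UMMLA used by the u8 kernels later in this patch is analogous.]

    #include <cstdint>

    // Scalar model: SMMLA Vd.4S, Vn.16B, Vm.16B. Vn carries a 2x8 block of
    // int8 (two rows), Vm carries another 2x8 block used transposed, and the
    // four int32 lanes of Vd accumulate the resulting 2x2 tile.
    static void smmla_model(int32_t acc[4], const int8_t a[16], const int8_t b[16])
    {
        for (int row = 0; row < 2; row++) {
            for (int col = 0; col < 2; col++) {
                int32_t sum = acc[row * 2 + col];
                for (int k = 0; k < 8; k++) {
                    sum += (int32_t)a[row * 8 + k] * (int32_t)b[col * 8 + k];
                }
                acc[row * 2 + col] = sum;
            }
        }
    }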
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index 5d9d84815a..ebc43425b8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_u8qa_dot_4x16_a55( ARGLIST );
class cls_a64_hybrid_u8qa_dot_4x16
{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
typedef uint8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,24 @@ public:
return false;
}
- StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 7.5301 };
- default:
- return { 27.5482 };
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 7.5301 };
+ default:
+ return { 27.5482 };
+ case CPUModel::A510:
+ return { 14.81 };
+ case CPUModel::V1:
+ return { 48.36 };
+ }
}
+
+ return { 1.0 };
}
// Default to the generic kernel
@@ -99,4 +108,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
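[Editor's note: the heuristic change in this header templates get_performance_parameters on the operand type. The braced figures are sustained MACs/cycle estimates used to rank candidate kernels, with new entries for CPUModel::A510 and CPUModel::V1, and a deliberately poor { 1.0 } fallback when the queried type does not match the kernel's native uint8_t. A hedged sketch of how a selector could consume these numbers follows; everything except get_performance_parameters and CPUInfo is an illustrative assumption, including the PerformanceParameters field name.]

    // Hypothetical ranking helper, not the library's dispatch code;
    // assumes arm_gemm's CPUInfo / PerformanceParameters headers.
    template<typename Strategy, typename T>
    double estimated_cycles(const CPUInfo *ci, uint64_t total_macs)
    {
        auto pp = Strategy::template get_performance_parameters<T>(ci);
        // kernel_macs_cycle stands in for whichever field the braced
        // initializer above populates (assumed name).
        return (double)total_macs / pp.kernel_macs_cycle;
    }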
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
index 954e2891fb..c410374357 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp
@@ -406,10 +406,10 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"b 122f\n"
"31:" // Height 2
"movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
"ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v12.4s, #0x0\n"
"ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "movi v15.16b, #0x1\n"
"mov x9, %x[col_bias]\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov x28, %x[output_ptr]\n"
@@ -853,12 +853,12 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"b 122f\n"
"61:" // Height 3
"movi v11.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"movi v12.4s, #0x0\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v13.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"mov x9, %x[col_bias]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "movi v15.16b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov x28, %x[output_ptr]\n"
"62:" // Height 3: Column loop
@@ -1426,14 +1426,14 @@ void a64_hybrid_u8qa_dot_4x16_a55 (
"b 122f\n"
"91:" // Height 4
"movi v11.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
"movi v12.4s, #0x0\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"movi v13.4s, #0x0\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
"movi v14.4s, #0x0\n"
- "movi v15.16b, #0x1\n"
- "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
- "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
"mov x9, %x[col_bias]\n"
- "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v15.16b, #0x1\n"
"bic %x[flags], %x[flags], #0x80000000\n"
"mov x28, %x[output_ptr]\n"
"mov x19, #0x4\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
index 6e85eec204..4fc680c45b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -283,16 +283,16 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqrdmulh v19.4s, v19.4s, v4.4s\n"
"tbz %x[flags], #5, 20f\n"
"and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
"and v6.16b, v18.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"and v7.16b, v19.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v16.4s, v16.4s, v4.4s\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v17.4s, v17.4s, v5.4s\n"
"sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
"20:" // Height 1: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
@@ -612,8 +612,8 @@ void a64_hybrid_u8qa_dot_4x16 (
"ld1r { v2.4s }, [x22]\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
- "addp v12.4s, v12.4s, v12.4s\n"
"neg v2.4s, v2.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
"mul v11.4s, v11.4s, v2.4s\n"
"mul v12.4s, v12.4s, v2.4s\n"
"49:" // Height 2: skip row sum fixup
@@ -653,27 +653,27 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqrdmulh v23.4s, v23.4s, v4.4s\n"
"tbz %x[flags], #5, 50f\n"
"and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
"and v6.16b, v18.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
"and v7.16b, v19.16b, v0.16b\n"
"and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"and v9.16b, v21.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
"sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
"sqadd v20.4s, v20.4s, v8.4s\n"
"sqadd v21.4s, v21.4s, v9.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v10.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
"50:" // Height 2: no shift correction
@@ -690,8 +690,6 @@ void a64_hybrid_u8qa_dot_4x16 (
"cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
"add v16.4s, v16.4s, v4.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
@@ -710,16 +708,18 @@ void a64_hybrid_u8qa_dot_4x16 (
"smax v19.4s, v19.4s, v5.4s\n"
"smax v20.4s, v20.4s, v5.4s\n"
"smax v21.4s, v21.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
"add v22.4s, v22.4s, v4.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
"smin v22.4s, v22.4s, v6.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
"smax v22.4s, v22.4s, v5.4s\n"
"smax v23.4s, v23.4s, v5.4s\n"
- "uzp1 v20.8h, v20.8h, v21.8h\n"
- "uzp1 v16.16b, v16.16b, v17.16b\n"
"uzp1 v21.8h, v22.8h, v23.8h\n"
"uzp1 v20.16b, v20.16b, v21.16b\n"
"bge 59f\n"
@@ -1094,9 +1094,9 @@ void a64_hybrid_u8qa_dot_4x16 (
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
+ "neg v3.4s, v3.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
- "neg v3.4s, v3.4s\n"
"mul v11.4s, v11.4s, v3.4s\n"
"mul v12.4s, v12.4s, v3.4s\n"
"mul v13.4s, v13.4s, v3.4s\n"
@@ -1149,39 +1149,39 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqrdmulh v27.4s, v27.4s, v4.4s\n"
"tbz %x[flags], #5, 80f\n"
"and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
"and v6.16b, v18.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
"and v7.16b, v19.16b, v0.16b\n"
"and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"and v9.16b, v21.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
"sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
"sqadd v20.4s, v20.4s, v8.4s\n"
"sqadd v21.4s, v21.4s, v9.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v10.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
"and v7.16b, v26.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"and v8.16b, v27.16b, v0.16b\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
"sqadd v26.4s, v26.4s, v7.4s\n"
"sqadd v27.4s, v27.4s, v8.4s\n"
"80:" // Height 3: no shift correction
@@ -1198,8 +1198,6 @@ void a64_hybrid_u8qa_dot_4x16 (
"cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
"add v16.4s, v16.4s, v4.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
@@ -1218,31 +1216,33 @@ void a64_hybrid_u8qa_dot_4x16 (
"smax v19.4s, v19.4s, v5.4s\n"
"smax v20.4s, v20.4s, v5.4s\n"
"smax v21.4s, v21.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
"add v22.4s, v22.4s, v4.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
"smin v22.4s, v22.4s, v6.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "srshl v25.4s, v25.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
"smax v22.4s, v22.4s, v5.4s\n"
"smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
"add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
"add v26.4s, v26.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
"add v27.4s, v27.4s, v4.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
"smin v26.4s, v26.4s, v6.4s\n"
- "smin v27.4s, v27.4s, v6.4s\n"
"uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
"smax v26.4s, v26.4s, v5.4s\n"
- "smax v27.4s, v27.4s, v5.4s\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
"uzp1 v21.8h, v22.8h, v23.8h\n"
"uzp1 v24.8h, v24.8h, v25.8h\n"
"uzp1 v25.8h, v26.8h, v27.8h\n"
@@ -1705,10 +1705,10 @@ void a64_hybrid_u8qa_dot_4x16 (
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
"addp v11.4s, v11.4s, v11.4s\n"
+ "neg v4.4s, v4.4s\n"
"addp v12.4s, v12.4s, v12.4s\n"
"addp v13.4s, v13.4s, v13.4s\n"
"addp v14.4s, v14.4s, v14.4s\n"
- "neg v4.4s, v4.4s\n"
"mul v11.4s, v11.4s, v4.4s\n"
"mul v12.4s, v12.4s, v4.4s\n"
"mul v13.4s, v13.4s, v4.4s\n"
@@ -1774,52 +1774,52 @@ void a64_hybrid_u8qa_dot_4x16 (
"sqrdmulh v31.4s, v31.4s, v4.4s\n"
"tbz %x[flags], #5, 110f\n"
"and v4.16b, v16.16b, v0.16b\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
"and v5.16b, v17.16b, v0.16b\n"
"and v6.16b, v18.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
"sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
"and v7.16b, v19.16b, v0.16b\n"
"and v8.16b, v20.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"and v9.16b, v21.16b, v0.16b\n"
"sshr v7.4s, v7.4s, #0x1f\n"
- "sqadd v16.4s, v16.4s, v4.4s\n"
- "and v10.16b, v22.16b, v0.16b\n"
"sshr v8.4s, v8.4s, #0x1f\n"
- "and v4.16b, v23.16b, v0.16b\n"
"sshr v9.4s, v9.4s, #0x1f\n"
- "sqadd v17.4s, v17.4s, v5.4s\n"
- "sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v18.4s, v18.4s, v6.4s\n"
- "sshr v4.4s, v4.4s, #0x1f\n"
- "and v5.16b, v24.16b, v0.16b\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v19.4s, v19.4s, v7.4s\n"
"sqadd v20.4s, v20.4s, v8.4s\n"
"sqadd v21.4s, v21.4s, v9.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v22.4s, v22.4s, v10.4s\n"
"sqadd v23.4s, v23.4s, v4.4s\n"
- "and v6.16b, v25.16b, v0.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
"sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
"and v7.16b, v26.16b, v0.16b\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
"and v8.16b, v27.16b, v0.16b\n"
- "and v9.16b, v28.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
"sshr v8.4s, v8.4s, #0x1f\n"
"sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "and v9.16b, v28.16b, v0.16b\n"
"and v10.16b, v29.16b, v0.16b\n"
- "sshr v9.4s, v9.4s, #0x1f\n"
"and v4.16b, v30.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
"sshr v10.4s, v10.4s, #0x1f\n"
- "sqadd v26.4s, v26.4s, v7.4s\n"
- "and v5.16b, v31.16b, v0.16b\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqadd v27.4s, v27.4s, v8.4s\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v28.4s, v28.4s, v9.4s\n"
"sqadd v29.4s, v29.4s, v10.4s\n"
"sqadd v30.4s, v30.4s, v4.4s\n"
+ "and v5.16b, v31.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
"sqadd v31.4s, v31.4s, v5.4s\n"
"110:" // Height 4: no shift correction
"srshl v16.4s, v16.4s, v0.4s\n"
@@ -1835,8 +1835,6 @@ void a64_hybrid_u8qa_dot_4x16 (
"cmp x9, #0x10\n"
"srshl v20.4s, v20.4s, v0.4s\n"
"srshl v21.4s, v21.4s, v0.4s\n"
- "srshl v22.4s, v22.4s, v0.4s\n"
- "srshl v23.4s, v23.4s, v0.4s\n"
"add v16.4s, v16.4s, v4.4s\n"
"add v17.4s, v17.4s, v4.4s\n"
"add v18.4s, v18.4s, v4.4s\n"
@@ -1855,45 +1853,47 @@ void a64_hybrid_u8qa_dot_4x16 (
"smax v19.4s, v19.4s, v5.4s\n"
"smax v20.4s, v20.4s, v5.4s\n"
"smax v21.4s, v21.4s, v5.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
"add v22.4s, v22.4s, v4.4s\n"
"add v23.4s, v23.4s, v4.4s\n"
- "srshl v24.4s, v24.4s, v0.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
"smin v22.4s, v22.4s, v6.4s\n"
"smin v23.4s, v23.4s, v6.4s\n"
- "srshl v25.4s, v25.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
"smax v22.4s, v22.4s, v5.4s\n"
"smax v23.4s, v23.4s, v5.4s\n"
- "add v24.4s, v24.4s, v4.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
"add v25.4s, v25.4s, v4.4s\n"
"srshl v26.4s, v26.4s, v0.4s\n"
- "smin v24.4s, v24.4s, v6.4s\n"
- "smin v25.4s, v25.4s, v6.4s\n"
"srshl v27.4s, v27.4s, v0.4s\n"
- "smax v24.4s, v24.4s, v5.4s\n"
- "smax v25.4s, v25.4s, v5.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
"add v26.4s, v26.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
"add v27.4s, v27.4s, v4.4s\n"
- "srshl v28.4s, v28.4s, v0.4s\n"
"smin v26.4s, v26.4s, v6.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
"smin v27.4s, v27.4s, v6.4s\n"
- "srshl v29.4s, v29.4s, v0.4s\n"
"smax v26.4s, v26.4s, v5.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
"smax v27.4s, v27.4s, v5.4s\n"
- "add v28.4s, v28.4s, v4.4s\n"
- "add v29.4s, v29.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
"srshl v30.4s, v30.4s, v0.4s\n"
- "smin v28.4s, v28.4s, v6.4s\n"
- "smin v29.4s, v29.4s, v6.4s\n"
"srshl v31.4s, v31.4s, v0.4s\n"
- "smax v28.4s, v28.4s, v5.4s\n"
- "smax v29.4s, v29.4s, v5.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
"add v30.4s, v30.4s, v4.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
"add v31.4s, v31.4s, v4.4s\n"
- "uzp1 v16.8h, v16.8h, v17.8h\n"
"smin v30.4s, v30.4s, v6.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
"smin v31.4s, v31.4s, v6.4s\n"
- "uzp1 v17.8h, v18.8h, v19.8h\n"
"smax v30.4s, v30.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
"smax v31.4s, v31.4s, v5.4s\n"
"uzp1 v20.8h, v20.8h, v21.8h\n"
"uzp1 v21.8h, v22.8h, v23.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
new file mode 100644
index 0000000000..8a47701a4a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_u8qa_mmla_4x16( ARGLIST );
+
+class cls_a64_hybrid_u8qa_mmla_4x16
+{
+public:
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 4, 16, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 47.68 };
+ case CPUModel::A510:
+ return { 28.00 };
+ case CPUModel::V1:
+ return { 68.98 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_u8qa_mmla_4x16;
+ cls_a64_hybrid_u8qa_mmla_4x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
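[Editor's note: the new interface above fixes the blocking at a 4x16 output tile with a k_unroll of 8, matching the MMLA block depth; M and N remainders are handled inside the kernel by the per-height paths and the partial-writeback ladders visible in the generic.cpp that follows. Before every ummla, pairs of input rows are zipped with trn1/trn2 on .2d lanes so each operand register carries a 2x8 uint8 block, and odd row counts pair against a zeroed register (the movi v2.16b, #0x0 in the odd-block paths). A small model of that pairing, with illustrative names:]

    #include <cstdint>
    #include <cstring>

    // trn1/trn2 on .2d lanes: interleave the 64-bit halves of two rows so
    // "even" holds K-block 0..7 of both rows and "odd" holds K-block 8..15.
    static void trn_2d_model(const uint8_t row0[16], const uint8_t row1[16],
                             uint8_t even[16], uint8_t odd[16])
    {
        std::memcpy(even + 0, row0 + 0, 8); // trn1: low half of each row
        std::memcpy(even + 8, row1 + 0, 8);
        std::memcpy(odd + 0, row0 + 8, 8);  // trn2: high half of each row
        std::memcpy(odd + 8, row1 + 8, 8);
    }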
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
new file mode 100644
index 0000000000..daeb986529
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp
@@ -0,0 +1,2104 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8qa_mmla_4x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 97f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 65f\n"
+ "beq 33f\n"
+ "movi v11.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v15.16b, #0x1\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x24, #0x10\n"
+ "blt 11f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q5, [x28, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "blt 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 8f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "ldr q1, [x23, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "ldr q5, [x28, #0x0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ "bge 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "sub x24, x24, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 10f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ "10:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "11:" // Height 1: Multiply loop: Main loop skip
+ "cbz x24, 20f\n"
+ "cmp x24, #0x8\n"
+ "blt 14f\n"
+ "12:" // Height 1: Multiply loop: Odd block loop
+ "movi v2.16b, #0x0\n"
+ "ldr d1, [x23], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 13f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "13:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q8, [x28, #0x0]\n"
+ ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n"
+ "ldr q9, [x28, #0x10]\n"
+ "sub x24, x24, #0x8\n"
+ ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n"
+ "ldr q10, [x28, #0x20]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x30]\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "ldr q8, [x28, #0x70]\n"
+ ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n"
+ "bge 12b\n"
+ "cbz x24, 20f\n"
+ "14:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x24, #2, 16f\n"
+ "ldr s1, [x23], #0x4\n"
+ "tbz x24, #1, 15f\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "tbz x24, #0, 18f\n"
+ "ld1 { v1.b }[6], [x23]\n"
+ "b 18f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x24, #0, 18f\n"
+ "ld1 { v1.b }[4], [x23]\n"
+ "b 18f\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x24, #1, 17f\n"
+ "ldr h1, [x23], #0x2\n"
+ "tbz x24, #0, 18f\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "b 18f\n"
+ "17:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x23, #0x0]\n"
+ "18:" // Height 1: Multiply loop: Ragged operand read: Done
+ "movi v2.16b, #0x0\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 19f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "19:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x10]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ "ldr q8, [x28, #0x50]\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x60]\n"
+ "ldr q10, [x28, #0x70]\n"
+ ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n"
+ "20:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 4b\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v16.16b\n"
+ "tbnz %x[flags], #31, 21f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v1.4s }, [x22]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "neg v1.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v1.4s\n"
+ "21:" // Height 1: skip row sum fixup
+ "add v23.4s, v23.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 22f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "22:" // Height 1: no shift correction
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x22, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v23.8h, v23.8h, v17.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.16b, v23.16b, v17.16b\n"
+ "bge 31f\n"
+ "tbz x9, #3, 26f\n"
+ "str d23, [x26], #0x8\n"
+ "tbz x9, #2, 24f\n"
+ "st1 { v23.s }[2], [x26], #0x4\n"
+ "tbz x9, #1, 23f\n"
+ "st1 { v23.h }[6], [x26], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[14], [x26]\n"
+ "b 30f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[12], [x26]\n"
+ "b 30f\n"
+ "24:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 25f\n"
+ "st1 { v23.h }[4], [x26], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[10], [x26]\n"
+ "b 30f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[8], [x26]\n"
+ "b 30f\n"
+ "26:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 28f\n"
+ "str s23, [x26], #0x4\n"
+ "tbz x9, #1, 27f\n"
+ "st1 { v23.h }[2], [x26], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[6], [x26]\n"
+ "b 30f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[4], [x26]\n"
+ "b 30f\n"
+ "28:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 29f\n"
+ "str h23, [x26], #0x2\n"
+ "tbz x9, #0, 30f\n"
+ "st1 { v23.b }[2], [x26]\n"
+ "b 30f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_0
+ "str b23, [x26, #0x0]\n"
+ "30:" // Height 1: Partial direct writeback: Done
+ "b 32f\n"
+ "31:" // Height 1: Full writeback
+ "str q23, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "32:" // Height 1: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 2b\n"
+ "b 130f\n"
+ "33:" // Height 2
+ "movi v11.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v15.16b, #0x1\n"
+ "mov x26, %x[output_ptr]\n"
+ "34:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "35:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "36:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 38f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "38:" // Height 2: input setup done
+ "cmp x24, #0x10\n"
+ "blt 43f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "blt 41f\n"
+ "39:" // Height 2: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "ldr q8, [x28, #0x30]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q10, [x28, #0x50]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "bge 39b\n"
+ "41:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "sub x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ "ldr q9, [x28, #0x40]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 42f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ "42:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "43:" // Height 2: Multiply loop: Main loop skip
+ "cbz x24, 52f\n"
+ "cmp x24, #0x8\n"
+ "blt 46f\n"
+ "44:" // Height 2: Multiply loop: Odd block loop
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 45f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "45:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q8, [x28, #0x0]\n"
+ ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n"
+ "ldr q9, [x28, #0x10]\n"
+ "sub x24, x24, #0x8\n"
+ ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n"
+ "ldr q10, [x28, #0x20]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x30]\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "ldr q8, [x28, #0x70]\n"
+ ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n"
+ "bge 44b\n"
+ "cbz x24, 52f\n"
+ "46:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x24, #2, 48f\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "tbz x24, #1, 47f\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "ld1 { v2.h }[2], [x22], #0x2\n"
+ "tbz x24, #0, 50f\n"
+ "ld1 { v1.b }[6], [x23]\n"
+ "ld1 { v2.b }[6], [x22]\n"
+ "b 50f\n"
+ "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x24, #0, 50f\n"
+ "ld1 { v1.b }[4], [x23]\n"
+ "ld1 { v2.b }[4], [x22]\n"
+ "b 50f\n"
+ "48:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x24, #1, 49f\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "tbz x24, #0, 50f\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "b 50f\n"
+ "49:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "50:" // Height 2: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "tbnz %x[flags], #31, 51f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "51:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x10]\n"
+ "ldr q5, [x28, #0x20]\n"
+ ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ "ldr q8, [x28, #0x50]\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x60]\n"
+ "ldr q10, [x28, #0x70]\n"
+ ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n"
+ "52:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 36b\n"
+ "uzp1 v4.2d, v16.2d, v20.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "mov v23.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 53f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x22]\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "neg v2.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v2.4s\n"
+ "mul v12.4s, v12.4s, v2.4s\n"
+ "53:" // Height 2: skip row sum fixup
+ "add v23.4s, v23.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 54f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "and v7.16b, v22.16b, v0.16b\n"
+ "and v8.16b, v16.16b, v0.16b\n"
+ "and v9.16b, v17.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "sqadd v16.4s, v16.4s, v8.4s\n"
+ "sqadd v17.4s, v17.4s, v9.4s\n"
+ "and v10.16b, v18.16b, v0.16b\n"
+ "and v4.16b, v19.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "54:" // Height 2: no shift correction
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "add x22, %x[qp], %[minval]\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "uzp1 v23.8h, v23.8h, v20.8h\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v23.16b, v23.16b, v20.16b\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 63f\n"
+ "tbz x9, #3, 58f\n"
+ "str d23, [x26], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "tbz x9, #2, 56f\n"
+ "st1 { v23.s }[2], [x26], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "tbz x9, #1, 55f\n"
+ "st1 { v23.h }[6], [x26], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[14], [x26]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "b 62f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[12], [x26]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "b 62f\n"
+ "56:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 57f\n"
+ "st1 { v23.h }[4], [x26], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[10], [x26]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "b 62f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[8], [x26]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "b 62f\n"
+ "58:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 60f\n"
+ "str s23, [x26], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "tbz x9, #1, 59f\n"
+ "st1 { v23.h }[2], [x26], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[6], [x26]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "b 62f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[4], [x26]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "b 62f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 61f\n"
+ "str h23, [x26], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "tbz x9, #0, 62f\n"
+ "st1 { v23.b }[2], [x26]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "b 62f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_0
+ "str b23, [x26, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "62:" // Height 2: Partial direct writeback: Done
+ "b 64f\n"
+ "63:" // Height 2: Full writeback
+ "str q23, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q16, [x21, #0x0]\n"
+ "64:" // Height 2: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 34b\n"
+ "b 130f\n"
+ "65:" // Height 3
+ "movi v11.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "mov x26, %x[output_ptr]\n"
+ "movi v15.16b, #0x1\n"
+ "66:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "67:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "68:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 69f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 70f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 70f\n"
+ "69:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "70:" // Height 3: input setup done
+ "cmp x24, #0x10\n"
+ "blt 75f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "blt 73f\n"
+ "71:" // Height 3: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x21, #0x0]\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 72f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
+ "72:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "bge 71b\n"
+ "73:" // Height 3: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x21, #0x0]\n"
+ "sub x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ "ldr q10, [x28, #0x50]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 74f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
+ "74:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "75:" // Height 3: Multiply loop: Main loop skip
+ "cbz x24, 84f\n"
+ "cmp x24, #0x8\n"
+ "blt 78f\n"
+ "76:" // Height 3: Multiply loop: Odd block loop
+ "movi v7.16b, #0x0\n"
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x21], #0x8\n"
+ "trn1 v2.2d, v3.2d, v7.2d\n"
+ "tbnz %x[flags], #31, 77f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "77:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q8, [x28, #0x0]\n"
+ ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n"
+ "ldr q9, [x28, #0x10]\n"
+ "sub x24, x24, #0x8\n"
+ ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n"
+ "ldr q10, [x28, #0x20]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "ldr q8, [x28, #0x70]\n"
+ ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n"
+ "bge 76b\n"
+ "cbz x24, 84f\n"
+ "78:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x24, #2, 80f\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "tbz x24, #1, 79f\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "ld1 { v2.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "tbz x24, #0, 82f\n"
+ "ld1 { v1.b }[6], [x23]\n"
+ "ld1 { v2.b }[6], [x22]\n"
+ "ld1 { v3.b }[6], [x21]\n"
+ "b 82f\n"
+ "79:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x24, #0, 82f\n"
+ "ld1 { v1.b }[4], [x23]\n"
+ "ld1 { v2.b }[4], [x22]\n"
+ "ld1 { v3.b }[4], [x21]\n"
+ "b 82f\n"
+ "80:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x24, #1, 81f\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "tbz x24, #0, 82f\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x21]\n"
+ "b 82f\n"
+ "81:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x21, #0x0]\n"
+ "82:" // Height 3: Multiply loop: Ragged operand read: Done
+ "movi v9.16b, #0x0\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v9.2d\n"
+ "tbnz %x[flags], #31, 83f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "83:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n"
+ "ldr q8, [x28, #0x50]\n"
+ ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n"
+ "ldr q9, [x28, #0x60]\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ "ldr q10, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n"
+ ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n"
+ ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n"
+ "84:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 68b\n"
+ "uzp1 v4.2d, v16.2d, v20.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "add x20, x21, x19\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 85f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v3.4s }, [x22]\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "neg v3.4s, v3.4s\n"
+ "dup v13.4s, v13.s[0]\n"
+ "mul v11.4s, v11.4s, v3.4s\n"
+ "mul v12.4s, v12.4s, v3.4s\n"
+ "mul v13.4s, v13.4s, v3.4s\n"
+ "85:" // Height 3: skip row sum fixup
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 86f\n"
+ "and v4.16b, v31.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "and v7.16b, v22.16b, v0.16b\n"
+ "and v8.16b, v16.16b, v0.16b\n"
+ "and v9.16b, v17.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "sqadd v16.4s, v16.4s, v8.4s\n"
+ "sqadd v17.4s, v17.4s, v9.4s\n"
+ "and v10.16b, v18.16b, v0.16b\n"
+ "and v4.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "86:" // Height 3: no shift correction
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "add x22, %x[qp], %[minval]\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "uzp1 v31.8h, v31.8h, v20.8h\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 95f\n"
+ "tbz x9, #3, 90f\n"
+ "str d31, [x26], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x9, #2, 88f\n"
+ "st1 { v31.s }[2], [x26], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "st1 { v24.s }[2], [x20], #0x4\n"
+ "tbz x9, #1, 87f\n"
+ "st1 { v31.h }[6], [x26], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "st1 { v24.h }[6], [x20], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[14], [x26]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "st1 { v24.b }[14], [x20]\n"
+ "b 94f\n"
+ "87:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[12], [x26]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "st1 { v24.b }[12], [x20]\n"
+ "b 94f\n"
+ "88:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 89f\n"
+ "st1 { v31.h }[4], [x26], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "st1 { v24.h }[4], [x20], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[10], [x26]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "st1 { v24.b }[10], [x20]\n"
+ "b 94f\n"
+ "89:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[8], [x26]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "st1 { v24.b }[8], [x20]\n"
+ "b 94f\n"
+ "90:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 92f\n"
+ "str s31, [x26], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "str s24, [x20], #0x4\n"
+ "tbz x9, #1, 91f\n"
+ "st1 { v31.h }[2], [x26], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "st1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[6], [x26]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "st1 { v24.b }[6], [x20]\n"
+ "b 94f\n"
+ "91:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[4], [x26]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "st1 { v24.b }[4], [x20]\n"
+ "b 94f\n"
+ "92:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 93f\n"
+ "str h31, [x26], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "str h24, [x20], #0x2\n"
+ "tbz x9, #0, 94f\n"
+ "st1 { v31.b }[2], [x26]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "st1 { v24.b }[2], [x20]\n"
+ "b 94f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_0
+ "str b31, [x26, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "str b24, [x20, #0x0]\n"
+ "94:" // Height 3: Partial direct writeback: Done
+ "b 96f\n"
+ "95:" // Height 3: Full writeback
+ "str q31, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q16, [x21, #0x0]\n"
+ "str q24, [x20, #0x0]\n"
+ "96:" // Height 3: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 66b\n"
+ "b 130f\n"
+ "97:" // Height 4
+ "movi v11.4s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "movi v14.4s, #0x0\n"
+ "mov x19, #0x4\n"
+ "movi v15.16b, #0x1\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "98:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "99:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "100:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 101f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x25, 102f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 102f\n"
+ "101:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "102:" // Height 4: input setup done
+ "cmp x24, #0x10\n"
+ "blt 107f\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "cmp x24, #0x20\n"
+ "blt 105f\n"
+ "103:" // Height 4: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x20, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 104f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
+ "104:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "sub x24, x24, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "cmp x24, #0x20\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr q1, [x23, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "bge 103b\n"
+ "105:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x21, #0x0]\n"
+ "sub x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x20, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x28, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x28, #0x10]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e85a410 // ummla v16.4s, v0.16b, v5.16b\n"
+ "ldr q7, [x28, #0x20]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ "ldr q8, [x28, #0x30]\n"
+ ".inst 0x6e86a414 // ummla v20.4s, v0.16b, v6.16b\n"
+ "ldr q9, [x28, #0x40]\n"
+ ".inst 0x6e86a45c // ummla v28.4s, v2.16b, v6.16b\n"
+ "ldr q10, [x28, #0x50]\n"
+ "ldr q4, [x28, #0x60]\n"
+ ".inst 0x6e87a411 // ummla v17.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q5, [x28, #0x70]\n"
+ ".inst 0x6e88a415 // ummla v21.4s, v0.16b, v8.16b\n"
+ "ldr q6, [x28, #0x80]\n"
+ ".inst 0x6e88a45d // ummla v29.4s, v2.16b, v8.16b\n"
+ "ldr q7, [x28, #0x90]\n"
+ ".inst 0x6e89a412 // ummla v18.4s, v0.16b, v9.16b\n"
+ "ldr q8, [x28, #0xa0]\n"
+ ".inst 0x6e89a45a // ummla v26.4s, v2.16b, v9.16b\n"
+ "ldr q9, [x28, #0xb0]\n"
+ ".inst 0x6e8aa416 // ummla v22.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45e // ummla v30.4s, v2.16b, v10.16b\n"
+ "ldr q10, [x28, #0xc0]\n"
+ ".inst 0x6e84a413 // ummla v19.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a45b // ummla v27.4s, v2.16b, v4.16b\n"
+ "ldr q4, [x28, #0xd0]\n"
+ ".inst 0x6e85a417 // ummla v23.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45f // ummla v31.4s, v2.16b, v5.16b\n"
+ "ldr q5, [x28, #0xe0]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a478 // ummla v24.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x28, #0xf0]\n"
+ "add x28, x28, #0x100\n"
+ ".inst 0x6e87a434 // ummla v20.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a47c // ummla v28.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e88a431 // ummla v17.4s, v1.16b, v8.16b\n"
+ ".inst 0x6e88a479 // ummla v25.4s, v3.16b, v8.16b\n"
+ ".inst 0x6e89a435 // ummla v21.4s, v1.16b, v9.16b\n"
+ ".inst 0x6e89a47d // ummla v29.4s, v3.16b, v9.16b\n"
+ ".inst 0x6e8aa432 // ummla v18.4s, v1.16b, v10.16b\n"
+ ".inst 0x6e8aa47a // ummla v26.4s, v3.16b, v10.16b\n"
+ ".inst 0x6e84a436 // ummla v22.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a437 // ummla v23.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a47f // ummla v31.4s, v3.16b, v6.16b\n"
+ "tbnz %x[flags], #31, 106f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f942b // udot v11.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f946d // udot v13.4s, v3.16b, v15.16b\n"
+ "106:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x23, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "107:" // Height 4: Multiply loop: Main loop skip
+ "cbz x24, 116f\n"
+ "cmp x24, #0x8\n"
+ "blt 110f\n"
+ "108:" // Height 4: Multiply loop: Odd block loop
+ "ldr d1, [x23], #0x8\n"
+ "ldr d2, [x22], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x21], #0x8\n"
+ "ldr d7, [x20], #0x8\n"
+ "trn1 v2.2d, v3.2d, v7.2d\n"
+ "tbnz %x[flags], #31, 109f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "109:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q8, [x28, #0x0]\n"
+ ".inst 0x6e88a410 // ummla v16.4s, v0.16b, v8.16b\n"
+ "ldr q9, [x28, #0x10]\n"
+ "sub x24, x24, #0x8\n"
+ ".inst 0x6e88a458 // ummla v24.4s, v2.16b, v8.16b\n"
+ "ldr q10, [x28, #0x20]\n"
+ "cmp x24, #0x8\n"
+ ".inst 0x6e89a414 // ummla v20.4s, v0.16b, v9.16b\n"
+ "ldr q4, [x28, #0x30]\n"
+ ".inst 0x6e89a45c // ummla v28.4s, v2.16b, v9.16b\n"
+ "ldr q5, [x28, #0x40]\n"
+ ".inst 0x6e8aa411 // ummla v17.4s, v0.16b, v10.16b\n"
+ "ldr q6, [x28, #0x50]\n"
+ ".inst 0x6e8aa459 // ummla v25.4s, v2.16b, v10.16b\n"
+ "ldr q7, [x28, #0x60]\n"
+ "ldr q8, [x28, #0x70]\n"
+ ".inst 0x6e84a415 // ummla v21.4s, v0.16b, v4.16b\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e84a45d // ummla v29.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a412 // ummla v18.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a45a // ummla v26.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e86a416 // ummla v22.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45e // ummla v30.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a413 // ummla v19.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a45b // ummla v27.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e88a417 // ummla v23.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e88a45f // ummla v31.4s, v2.16b, v8.16b\n"
+ "bge 108b\n"
+ "cbz x24, 116f\n"
+ "110:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x24, #2, 112f\n"
+ "ldr s1, [x23], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x21], #0x4\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz x24, #1, 111f\n"
+ "ld1 { v1.h }[2], [x23], #0x2\n"
+ "ld1 { v2.h }[2], [x22], #0x2\n"
+ "ld1 { v3.h }[2], [x21], #0x2\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x24, #0, 114f\n"
+ "ld1 { v1.b }[6], [x23]\n"
+ "ld1 { v2.b }[6], [x22]\n"
+ "ld1 { v3.b }[6], [x21]\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 114f\n"
+ "111:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x24, #0, 114f\n"
+ "ld1 { v1.b }[4], [x23]\n"
+ "ld1 { v2.b }[4], [x22]\n"
+ "ld1 { v3.b }[4], [x21]\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 114f\n"
+ "112:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x24, #1, 113f\n"
+ "ldr h1, [x23], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x21], #0x2\n"
+ "ldr h9, [x20], #0x2\n"
+ "tbz x24, #0, 114f\n"
+ "ld1 { v1.b }[2], [x23]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x21]\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 114f\n"
+ "113:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x23, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x21, #0x0]\n"
+ "ldr b9, [x20, #0x0]\n"
+ "114:" // Height 4: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v9.2d\n"
+ "tbnz %x[flags], #31, 115f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "115:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q10, [x28, #0x0]\n"
+ ".inst 0x6e8aa410 // ummla v16.4s, v0.16b, v10.16b\n"
+ "ldr q4, [x28, #0x10]\n"
+ ".inst 0x6e8aa458 // ummla v24.4s, v2.16b, v10.16b\n"
+ "ldr q5, [x28, #0x20]\n"
+ "ldr q6, [x28, #0x30]\n"
+ ".inst 0x6e84a414 // ummla v20.4s, v0.16b, v4.16b\n"
+ "ldr q7, [x28, #0x40]\n"
+ ".inst 0x6e84a45c // ummla v28.4s, v2.16b, v4.16b\n"
+ "ldr q8, [x28, #0x50]\n"
+ ".inst 0x6e85a411 // ummla v17.4s, v0.16b, v5.16b\n"
+ "ldr q9, [x28, #0x60]\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ "ldr q10, [x28, #0x70]\n"
+ "add x28, x28, #0x80\n"
+ ".inst 0x6e86a415 // ummla v21.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a45d // ummla v29.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a412 // ummla v18.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a45a // ummla v26.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e88a416 // ummla v22.4s, v0.16b, v8.16b\n"
+ ".inst 0x6e88a45e // ummla v30.4s, v2.16b, v8.16b\n"
+ ".inst 0x6e89a413 // ummla v19.4s, v0.16b, v9.16b\n"
+ ".inst 0x6e89a45b // ummla v27.4s, v2.16b, v9.16b\n"
+ ".inst 0x6e8aa417 // ummla v23.4s, v0.16b, v10.16b\n"
+ ".inst 0x6e8aa45f // ummla v31.4s, v2.16b, v10.16b\n"
+ "116:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 100b\n"
+ "uzp1 v4.2d, v16.2d, v20.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x26, #0x0]\n"
+ "add x21, x26, x19\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "add x20, x21, x19\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "mov v31.16b, v4.16b\n"
+ "tbnz %x[flags], #31, 117f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "dup v12.4s, v11.s[3]\n"
+ "dup v11.4s, v11.s[0]\n"
+ "neg v4.4s, v4.4s\n"
+ "dup v14.4s, v13.s[3]\n"
+ "dup v13.4s, v13.s[0]\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "mul v12.4s, v12.4s, v4.4s\n"
+ "mul v13.4s, v13.4s, v4.4s\n"
+ "mul v14.4s, v14.4s, v4.4s\n"
+ "117:" // Height 4: skip row sum fixup
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ldr q0, [x27, #0x0]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "ldr q1, [x27, #0x10]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "ldr q2, [x27, #0x20]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "ldr q3, [x27, #0x30]\n"
+ "add x27, x27, #0x40\n"
+ "add v16.4s, v16.4s, v12.4s\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "add v17.4s, v17.4s, v12.4s\n"
+ "add v18.4s, v18.4s, v12.4s\n"
+ "add v19.4s, v19.4s, v12.4s\n"
+ "add v23.4s, v23.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v0.4s\n"
+ "add v20.4s, v20.4s, v1.4s\n"
+ "add v21.4s, v21.4s, v2.4s\n"
+ "add v22.4s, v22.4s, v3.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v23.4s, v23.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v1.4s\n"
+ "add v29.4s, v29.4s, v2.4s\n"
+ "add v30.4s, v30.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x23]\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 118f\n"
+ "and v4.16b, v31.16b, v0.16b\n"
+ "and v5.16b, v20.16b, v0.16b\n"
+ "and v6.16b, v21.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v31.4s, v31.4s, v4.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v6.4s\n"
+ "and v7.16b, v22.16b, v0.16b\n"
+ "and v8.16b, v16.16b, v0.16b\n"
+ "and v9.16b, v17.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v7.4s\n"
+ "sqadd v16.4s, v16.4s, v8.4s\n"
+ "sqadd v17.4s, v17.4s, v9.4s\n"
+ "and v10.16b, v18.16b, v0.16b\n"
+ "and v4.16b, v19.16b, v0.16b\n"
+ "and v5.16b, v23.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v10.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sqadd v23.4s, v23.4s, v5.4s\n"
+ "and v6.16b, v28.16b, v0.16b\n"
+ "and v7.16b, v29.16b, v0.16b\n"
+ "and v8.16b, v30.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v6.4s\n"
+ "sqadd v29.4s, v29.4s, v7.4s\n"
+ "sqadd v30.4s, v30.4s, v8.4s\n"
+ "and v9.16b, v24.16b, v0.16b\n"
+ "and v10.16b, v25.16b, v0.16b\n"
+ "and v4.16b, v26.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v9.4s\n"
+ "sqadd v25.4s, v25.4s, v10.4s\n"
+ "sqadd v26.4s, v26.4s, v4.4s\n"
+ "and v5.16b, v27.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v5.4s\n"
+ "118:" // Height 4: no shift correction
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x22]\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "add x22, %x[qp], %[minval]\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x22]\n"
+ "cmp x9, #0x10\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "uzp1 v31.8h, v31.8h, v20.8h\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "uzp1 v20.8h, v21.8h, v22.8h\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v23.8h, v23.8h, v28.8h\n"
+ "uzp1 v28.8h, v29.8h, v30.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v31.16b, v31.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v23.16b, v23.16b, v28.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 127f\n"
+ "tbz x9, #3, 122f\n"
+ "str d31, [x26], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d23, [x20], #0x8\n"
+ "str d24, [x19], #0x8\n"
+ "tbz x9, #2, 120f\n"
+ "st1 { v31.s }[2], [x26], #0x4\n"
+ "st1 { v16.s }[2], [x21], #0x4\n"
+ "st1 { v23.s }[2], [x20], #0x4\n"
+ "st1 { v24.s }[2], [x19], #0x4\n"
+ "tbz x9, #1, 119f\n"
+ "st1 { v31.h }[6], [x26], #0x2\n"
+ "st1 { v16.h }[6], [x21], #0x2\n"
+ "st1 { v23.h }[6], [x20], #0x2\n"
+ "st1 { v24.h }[6], [x19], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[14], [x26]\n"
+ "st1 { v16.b }[14], [x21]\n"
+ "st1 { v23.b }[14], [x20]\n"
+ "st1 { v24.b }[14], [x19]\n"
+ "b 126f\n"
+ "119:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[12], [x26]\n"
+ "st1 { v16.b }[12], [x21]\n"
+ "st1 { v23.b }[12], [x20]\n"
+ "st1 { v24.b }[12], [x19]\n"
+ "b 126f\n"
+ "120:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x9, #1, 121f\n"
+ "st1 { v31.h }[4], [x26], #0x2\n"
+ "st1 { v16.h }[4], [x21], #0x2\n"
+ "st1 { v23.h }[4], [x20], #0x2\n"
+ "st1 { v24.h }[4], [x19], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[10], [x26]\n"
+ "st1 { v16.b }[10], [x21]\n"
+ "st1 { v23.b }[10], [x20]\n"
+ "st1 { v24.b }[10], [x19]\n"
+ "b 126f\n"
+ "121:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[8], [x26]\n"
+ "st1 { v16.b }[8], [x21]\n"
+ "st1 { v23.b }[8], [x20]\n"
+ "st1 { v24.b }[8], [x19]\n"
+ "b 126f\n"
+ "122:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x9, #2, 124f\n"
+ "str s31, [x26], #0x4\n"
+ "str s16, [x21], #0x4\n"
+ "str s23, [x20], #0x4\n"
+ "str s24, [x19], #0x4\n"
+ "tbz x9, #1, 123f\n"
+ "st1 { v31.h }[2], [x26], #0x2\n"
+ "st1 { v16.h }[2], [x21], #0x2\n"
+ "st1 { v23.h }[2], [x20], #0x2\n"
+ "st1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[6], [x26]\n"
+ "st1 { v16.b }[6], [x21]\n"
+ "st1 { v23.b }[6], [x20]\n"
+ "st1 { v24.b }[6], [x19]\n"
+ "b 126f\n"
+ "123:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[4], [x26]\n"
+ "st1 { v16.b }[4], [x21]\n"
+ "st1 { v23.b }[4], [x20]\n"
+ "st1 { v24.b }[4], [x19]\n"
+ "b 126f\n"
+ "124:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x9, #1, 125f\n"
+ "str h31, [x26], #0x2\n"
+ "str h16, [x21], #0x2\n"
+ "str h23, [x20], #0x2\n"
+ "str h24, [x19], #0x2\n"
+ "tbz x9, #0, 126f\n"
+ "st1 { v31.b }[2], [x26]\n"
+ "st1 { v16.b }[2], [x21]\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "st1 { v24.b }[2], [x19]\n"
+ "b 126f\n"
+ "125:" // Height 4: Partial direct writeback: partial_1_0
+ "str b31, [x26, #0x0]\n"
+ "str b16, [x21, #0x0]\n"
+ "str b23, [x20, #0x0]\n"
+ "str b24, [x19, #0x0]\n"
+ "126:" // Height 4: Partial direct writeback: Done
+ "b 128f\n"
+ "127:" // Height 4: Full writeback
+ "str q31, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "str q16, [x21, #0x0]\n"
+ "str q23, [x20, #0x0]\n"
+ "str q24, [x19, #0x0]\n"
+ "128:" // Height 4: Writeback done
+ "subs x9, x9, #0x10\n"
+ "bgt 98b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 130f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 129f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "129:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "130:" // Exit
+
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
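
The requantization tail above folds the row sums in with the negated b_offset,
scales each accumulator with sqrdmulh against per_layer_mul, applies the
(negative) per_layer_right_shift with srshl, adds c_offset, clamps to
[minval, maxval] and narrows to bytes with uzp1. A scalar sketch of that
per-element tail, ignoring sqrdmulh saturation and the optional bit-5 fixup
(requantize_one is a hypothetical name; the parameters mirror Requantize32):

    #include <algorithm>
    #include <cstdint>

    static uint8_t requantize_one(int32_t acc, int32_t per_layer_mul,
                                  int32_t per_layer_right_shift, // <= 0, as fed to srshl
                                  int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: rounding doubling multiply, keeping the high 32 bits.
        int64_t prod = (int64_t)acc * (int64_t)per_layer_mul;
        int32_t v    = (int32_t)((prod + (1LL << 30)) >> 31);
        // srshl with a negative count is a rounding shift right.
        int s = -per_layer_right_shift;
        if (s > 0) {
            v = (int32_t)(((int64_t)v + (1LL << (s - 1))) >> s);
        }
        v += c_offset;
        return (uint8_t)std::min(maxval, std::max(minval, v));
    }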
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
index da07fc17a1..3a77397632 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef __aarch64__
+#ifdef __aarch64__
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
@@ -44,7 +44,8 @@ void a64_hybrid_u8u32_dot_6x16_a55( ARGLIST );
class cls_a64_hybrid_u8u32_dot_6x16
{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -70,16 +71,35 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
-
- static PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
{
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 12.667, 2.0799, 0.2279 };
- default:
- return { 29.6736, 11.4025, 0.5591 };
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 9.5238, 2.0799, 0.2279 };
+ default:
+ return { 29.6736, 11.4025, 0.5591 };
+ case CPUModel::A510:
+ return { 16.65, 3.92, 0.48 };
+ case CPUModel::V1:
+ return { 55.42, 19.29, 0.92 };
+ }
}
+
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.63 };
+ case CPUModel::A510:
+ return { 15.89 };
+ case CPUModel::V1:
+ return { 53.87 };
+ }
+ }
+
+ return { 1.0 };
}
// Default to the generic kernel
@@ -99,4 +119,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // __aarch64__
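
The hunk above makes get_performance_parameters a template keyed on the
accumulating data type, so the kernel selector can score this kernel
differently for the quantized (uint8_t) and 32-bit accumulator (uint32_t)
paths. A minimal sketch of querying both estimates, assuming the header is on
the include path and CPUInfo is visible through it:

    #include "a64_hybrid_u8u32_dot_6x16.hpp"

    namespace arm_gemm {
    static void query_estimates(const CPUInfo *ci)
    {
        // Three-parameter estimate used on the quantized path.
        auto q8  = cls_a64_hybrid_u8u32_dot_6x16::get_performance_parameters<uint8_t>(ci);
        // Single throughput figure used when accumulating in uint32.
        auto u32 = cls_a64_hybrid_u8u32_dot_6x16::get_performance_parameters<uint32_t>(ci);
        (void)q8; (void)u32;
    }
    } // namespace arm_gemm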
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
index ba57ad493a..ab0c88a3b2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -1819,8 +1819,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"ld1 { v22.4s }, [x21], #0x10\n"
"ld1 { v26.4s }, [x20], #0x10\n"
"tbz x10, #1, 139f\n"
- "mov x24, #0x38\n"
"ldr d11, [x28], #0x8\n"
+ "mov x24, #0x38\n"
"ldr d15, [x23], #0x8\n"
"ldr d19, [x22], #0x8\n"
"ldr d23, [x21], #0x8\n"
@@ -1873,8 +1873,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"ld1 { v20.4s }, [x21], #0x10\n"
"ld1 { v24.4s }, [x20], #0x10\n"
"tbz x10, #1, 143f\n"
- "mov x24, #0x18\n"
"ldr d9, [x28], #0x8\n"
+ "mov x24, #0x18\n"
"ldr d13, [x23], #0x8\n"
"ldr d17, [x22], #0x8\n"
"ldr d21, [x21], #0x8\n"
@@ -2487,12 +2487,12 @@ void a64_hybrid_u8u32_dot_6x16 (
"ld1 { v16.4s }, [x22], #0x10\n"
"ld1 { v20.4s }, [x21], #0x10\n"
"ld1 { v24.4s }, [x20], #0x10\n"
- "ld1 { v28.4s }, [x19], #0x10\n"
"ld1 { v9.4s }, [x28], #0x10\n"
"ld1 { v13.4s }, [x23], #0x10\n"
"ld1 { v17.4s }, [x22], #0x10\n"
"ld1 { v21.4s }, [x21], #0x10\n"
"ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
"ld1 { v29.4s }, [x19], #0x10\n"
"tbz x10, #2, 174f\n"
"ld1 { v10.4s }, [x28], #0x10\n"
@@ -2502,8 +2502,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"ld1 { v26.4s }, [x20], #0x10\n"
"ld1 { v30.4s }, [x19], #0x10\n"
"tbz x10, #1, 173f\n"
- "mov x24, #0x38\n"
"ldr d11, [x28], #0x8\n"
+ "mov x24, #0x38\n"
"ldr d15, [x23], #0x8\n"
"ldr d19, [x22], #0x8\n"
"ldr d23, [x21], #0x8\n"
@@ -2563,8 +2563,8 @@ void a64_hybrid_u8u32_dot_6x16 (
"ld1 { v24.4s }, [x20], #0x10\n"
"ld1 { v28.4s }, [x19], #0x10\n"
"tbz x10, #1, 177f\n"
- "mov x24, #0x18\n"
"ldr d9, [x28], #0x8\n"
+ "mov x24, #0x18\n"
"ldr d13, [x23], #0x8\n"
"ldr d17, [x22], #0x8\n"
"ldr d21, [x21], #0x8\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
new file mode 100644
index 0000000000..24bad3c63e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16.hpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint32_t>, \
+ const uint32_t *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hybrid_u8u32_mmla_6x16( ARGLIST );
+
+class cls_a64_hybrid_u8u32_mmla_6x16
+{
+public:
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<rhs_operand_type, result_type, 6, 16, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 55.05 };
+ case CPUModel::A510:
+ return { 30.34 };
+ case CPUModel::V1:
+ return { 83.77 };
+ }
+ }
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 55.31, 15.72, 0.62 };
+ case CPUModel::A510:
+ return { 33.64, 3.92, 0.48 };
+ case CPUModel::V1:
+ return { 86.71, 19.00, 0.93 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_u8u32_mmla_6x16;
+ cls_a64_hybrid_u8u32_mmla_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
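
The generic.cpp that follows is built around the UMMLA instruction: each
".inst"-encoded ummla multiplies a 2x8 unsigned 8-bit tile by an 8x2 tile and
accumulates a 2x2 int32 result, which is why inputs are paired with trn1/trn2
on entry and accumulators are unzipped with uzp1/uzp2 on writeback. A scalar
model of a single step (ummla_ref is a hypothetical name):

    #include <cstdint>

    // acc (2x2, int32) += A (2x8, uint8, row-major) * B (8x2, uint8,
    // stored as one 8-byte column per register half).
    static void ummla_ref(uint32_t acc[2][2], const uint8_t a[2][8], const uint8_t b[2][8])
    {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                uint32_t sum = 0;
                for (int k = 0; k < 8; k++) {
                    sum += (uint32_t)a[i][k] * (uint32_t)b[j][k];
                }
                acc[i][j] += sum;
            }
        }
    }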
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
new file mode 100644
index 0000000000..fabb3f3efd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp
@@ -0,0 +1,3463 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_mmla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
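+ // Bit 0 of flags marks accumulation, bit 2 an indirect output and
+ // bit 3 an indirect input; the assembly tests these with tbz/tbnz.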
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 186f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 149f\n"
+ "beq 112f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 75f\n"
+ "beq 38f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x10, #0x10\n"
+ "bge 11f\n"
+ "tbz x10, #3, 6f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "tbz x10, #2, 4f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 3f\n"
+ "mov x24, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "b 10f\n"
+ "3:" // Height 1: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "b 10f\n"
+ "4:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x10, #1, 5f\n"
+ "ldr d11, [x28], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "b 10f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "b 10f\n"
+ "6:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x10, #2, 8f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 7f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x24, #0x18\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "b 10f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 10f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "b 10f\n"
+ "8:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x10, #1, 9f\n"
+ "ldr d9, [x28], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x10, #0, 10f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "b 10f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "10:" // Height 1: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 12f\n"
+ "11:" // Height 1: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "12:" // Height 1: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "blt 20f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x26, 27f\n"
+ "cmp x26, #0x8\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "movi v2.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ "ldr q7, [x9, #0x10]\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
+ "bge 21b\n"
+ "cbz x26, 27f\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 24f\n"
+ "ldr s1, [x25], #0x4\n"
+ "tbz x26, #1, 23f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "tbz x26, #0, 26f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "b 26f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 26f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "b 26f\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 25f\n"
+ "ldr h1, [x25], #0x2\n"
+ "tbz x26, #0, 26f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "b 26f\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "26:" // Height 1: Multiply loop: Ragged operand read: Done
+ "movi v2.16b, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ "27:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 15b\n"
+ "uzp1 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "uzp1 v9.2d, v9.2d, v13.2d\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v11.2d, v11.2d, v15.2d\n"
+ "bge 36f\n"
+ "tbz x10, #3, 31f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "st1 { v9.4s }, [x28], #0x10\n"
+ "tbz x10, #2, 29f\n"
+ "st1 { v10.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 28f\n"
+ "str d11, [x28], #0x8\n"
+ "tbz x10, #0, 35f\n"
+ "st1 { v11.s }[2], [x28]\n"
+ "b 35f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 35f\n"
+ "str s11, [x28, #0x0]\n"
+ "b 35f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 30f\n"
+ "str d10, [x28], #0x8\n"
+ "tbz x10, #0, 35f\n"
+ "st1 { v10.s }[2], [x28]\n"
+ "b 35f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 35f\n"
+ "str s10, [x28, #0x0]\n"
+ "b 35f\n"
+ "31:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 33f\n"
+ "st1 { v8.4s }, [x28], #0x10\n"
+ "tbz x10, #1, 32f\n"
+ "str d9, [x28], #0x8\n"
+ "tbz x10, #0, 35f\n"
+ "st1 { v9.s }[2], [x28]\n"
+ "b 35f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 35f\n"
+ "str s9, [x28, #0x0]\n"
+ "b 35f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 34f\n"
+ "str d8, [x28], #0x8\n"
+ "tbz x10, #0, 35f\n"
+ "st1 { v8.s }[2], [x28]\n"
+ "b 35f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x28, #0x0]\n"
+ "35:" // Height 1: Partial direct writeback: Done
+ "b 37f\n"
+ "36:" // Height 1: Full writeback
+ "str q8, [x28, #0x0]\n"
+ "str q9, [x28, #0x10]\n"
+ "str q10, [x28, #0x20]\n"
+ "str q11, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "37:" // Height 1: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 2b\n"
+ "b 224f\n"
+ "38:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "39:" // Height 2: Column loop
+ "tbz %x[flags], #0, 50f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "bge 48f\n"
+ "tbz x10, #3, 43f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "tbz x10, #2, 41f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 40f\n"
+ "mov x24, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "tbz x10, #0, 47f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "b 47f\n"
+ "40:" // Height 2: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 47f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "b 47f\n"
+ "41:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x10, #1, 42f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "tbz x10, #0, 47f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "b 47f\n"
+ "42:" // Height 2: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 47f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "b 47f\n"
+ "43:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x10, #2, 45f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 44f\n"
+ "mov x24, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "tbz x10, #0, 47f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "b 47f\n"
+ "44:" // Height 2: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 47f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "b 47f\n"
+ "45:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x10, #1, 46f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "tbz x10, #0, 47f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "b 47f\n"
+ "46:" // Height 2: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "47:" // Height 2: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 49f\n"
+ "48:" // Height 2: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "49:" // Height 2: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "b 51f\n"
+ "50:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "51:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "52:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 53f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 54f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 54f\n"
+ "53:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "54:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "blt 57f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 56f\n"
+ "55:" // Height 2: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ "bge 55b\n"
+ "56:" // Height 2: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "57:" // Height 2: Multiply loop: Main loop skip
+ "cbz x26, 64f\n"
+ "cmp x26, #0x8\n"
+ "blt 59f\n"
+ "58:" // Height 2: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ "ldr q7, [x9, #0x10]\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
+ "bge 58b\n"
+ "cbz x26, 64f\n"
+ "59:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 61f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "tbz x26, #1, 60f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "tbz x26, #0, 63f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "b 63f\n"
+ "60:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 63f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "b 63f\n"
+ "61:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 62f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "tbz x26, #0, 63f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "b 63f\n"
+ "62:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "63:" // Height 2: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ "64:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 52b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "bge 73f\n"
+ "tbz x10, #3, 68f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "tbz x10, #2, 66f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 65f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "tbz x10, #0, 72f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "b 72f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 72f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "b 72f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 67f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "tbz x10, #0, 72f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "b 72f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 72f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "b 72f\n"
+ "68:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 70f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "tbz x10, #1, 69f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "tbz x10, #0, 72f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "b 72f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 72f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "b 72f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 71f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "tbz x10, #0, 72f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "b 72f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "72:" // Height 2: Partial direct writeback: Done
+ "b 74f\n"
+ "73:" // Height 2: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "74:" // Height 2: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 39b\n"
+ "b 224f\n"
+ "75:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "76:" // Height 3: Column loop
+ "tbz %x[flags], #0, 87f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "bge 85f\n"
+ "tbz x10, #3, 80f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "tbz x10, #2, 78f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v19.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 77f\n"
+ "mov x24, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "tbz x10, #0, 84f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "b 84f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 84f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "b 84f\n"
+ "78:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x10, #1, 79f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x10, #0, 84f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "b 84f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 84f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "b 84f\n"
+ "80:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x10, #2, 82f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 81f\n"
+ "mov x24, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "tbz x10, #0, 84f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "b 84f\n"
+ "81:" // Height 3: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 84f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "b 84f\n"
+ "82:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x10, #1, 83f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "tbz x10, #0, 84f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "b 84f\n"
+ "83:" // Height 3: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "84:" // Height 3: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 86f\n"
+ "85:" // Height 3: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q17, [x22, #0x0]\n"
+ "ldr q18, [x22, #0x10]\n"
+ "ldr q19, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "86:" // Height 3: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 88f\n"
+ "87:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "88:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "89:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 90f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 91f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 91f\n"
+ "90:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "91:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "blt 94f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 93f\n"
+ "92:" // Height 3: Multiply loop: Main loop head
+ "movi v4.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ "bge 92b\n"
+ "93:" // Height 3: Multiply loop: Single iteration only
+ "movi v4.16b, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ "94:" // Height 3: Multiply loop: Main loop skip
+ "cbz x26, 101f\n"
+ "cmp x26, #0x8\n"
+ "blt 96f\n"
+ "95:" // Height 3: Multiply loop: Odd block loop
+ "movi v4.16b, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ "bge 95b\n"
+ "cbz x26, 101f\n"
+ "96:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 98f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "tbz x26, #1, 97f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "tbz x26, #0, 100f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "b 100f\n"
+ "97:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 100f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "b 100f\n"
+ "98:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 99f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "tbz x26, #0, 100f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "b 100f\n"
+ "99:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "100:" // Height 3: Multiply loop: Ragged operand read: Done
+ "movi v4.16b, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ "101:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 89b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v19.2d, v19.2d, v23.2d\n"
+ "bge 110f\n"
+ "tbz x10, #3, 105f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "st1 { v17.4s }, [x22], #0x10\n"
+ "tbz x10, #2, 103f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v18.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 102f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d19, [x22], #0x8\n"
+ "tbz x10, #0, 109f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v19.s }[2], [x22]\n"
+ "b 109f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 109f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s19, [x22, #0x0]\n"
+ "b 109f\n"
+ "103:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 104f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d18, [x22], #0x8\n"
+ "tbz x10, #0, 109f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v18.s }[2], [x22]\n"
+ "b 109f\n"
+ "104:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 109f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s18, [x22, #0x0]\n"
+ "b 109f\n"
+ "105:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 107f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v16.4s }, [x22], #0x10\n"
+ "tbz x10, #1, 106f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d17, [x22], #0x8\n"
+ "tbz x10, #0, 109f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "b 109f\n"
+ "106:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 109f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s17, [x22, #0x0]\n"
+ "b 109f\n"
+ "107:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 108f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d16, [x22], #0x8\n"
+ "tbz x10, #0, 109f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v16.s }[2], [x22]\n"
+ "b 109f\n"
+ "108:" // Height 3: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s16, [x22, #0x0]\n"
+ "109:" // Height 3: Partial direct writeback: Done
+ "b 111f\n"
+ "110:" // Height 3: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q16, [x22, #0x0]\n"
+ "str q17, [x22, #0x10]\n"
+ "str q18, [x22, #0x20]\n"
+ "str q19, [x22, #0x30]\n"
+ "111:" // Height 3: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 76b\n"
+ "b 224f\n"
+ "112:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "113:" // Height 4: Column loop
+ "tbz %x[flags], #0, 124f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "bge 122f\n"
+ "tbz x10, #3, 117f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "tbz x10, #2, 115f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v19.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 114f\n"
+ "mov x24, #0x38\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "tbz x10, #0, 121f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "b 121f\n"
+ "114:" // Height 4: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 121f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "b 121f\n"
+ "115:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x10, #1, 116f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz x10, #0, 121f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "b 121f\n"
+ "116:" // Height 4: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 121f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "b 121f\n"
+ "117:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x10, #2, 119f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 118f\n"
+ "mov x24, #0x18\n"
+ "ldr d10, [x28], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "tbz x10, #0, 121f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "b 121f\n"
+ "118:" // Height 4: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 121f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "b 121f\n"
+ "119:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x10, #1, 120f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "tbz x10, #0, 121f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "b 121f\n"
+ "120:" // Height 4: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "121:" // Height 4: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 123f\n"
+ "122:" // Height 4: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q17, [x22, #0x0]\n"
+ "ldr q18, [x22, #0x10]\n"
+ "ldr q19, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "123:" // Height 4: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "b 125f\n"
+ "124:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "125:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "126:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 127f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 128f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 128f\n"
+ "127:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "128:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "blt 131f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 130f\n"
+ "129:" // Height 4: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ "bge 129b\n"
+ "130:" // Height 4: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ "131:" // Height 4: Multiply loop: Main loop skip
+ "cbz x26, 138f\n"
+ "cmp x26, #0x8\n"
+ "blt 133f\n"
+ "132:" // Height 4: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ "bge 132b\n"
+ "cbz x26, 138f\n"
+ "133:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 135f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "tbz x26, #1, 134f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "tbz x26, #0, 137f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "b 137f\n"
+ "134:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 137f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "b 137f\n"
+ "135:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 136f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x26, #0, 137f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 137f\n"
+ "136:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "137:" // Height 4: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ "138:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 126b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "bge 147f\n"
+ "tbz x10, #3, 142f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v17.4s }, [x21], #0x10\n"
+ "tbz x10, #2, 140f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v18.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 139f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x21]\n"
+ "b 146f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 146f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s19, [x21, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 141f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d18, [x21], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "b 146f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 146f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s18, [x21, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 144f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "tbz x10, #1, 143f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d17, [x21], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v17.s }[2], [x21]\n"
+ "b 146f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 146f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s17, [x21, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 145f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "tbz x10, #0, 146f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "b 146f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "str s16, [x21, #0x0]\n"
+ "146:" // Height 4: Partial direct writeback: Done
+ "b 148f\n"
+ "147:" // Height 4: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q20, [x22, #0x10]\n"
+ "str q21, [x22, #0x20]\n"
+ "str q22, [x22, #0x30]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q18, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "148:" // Height 4: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 113b\n"
+ "b 224f\n"
+ "149:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "150:" // Height 5: Column loop
+ "tbz %x[flags], #0, 161f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "bge 159f\n"
+ "tbz x10, #3, 154f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "tbz x10, #2, 152f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v19.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 151f\n"
+ "ldr d16, [x28], #0x8\n"
+ "mov x24, #0x38\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz x10, #0, 158f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 158f\n"
+ "151:" // Height 5: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 158f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s6, [x20, #0x0]\n"
+ "b 158f\n"
+ "152:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x10, #1, 153f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "tbz x10, #0, 158f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "b 158f\n"
+ "153:" // Height 5: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 158f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "b 158f\n"
+ "154:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x10, #2, 156f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 155f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "tbz x10, #0, 158f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "b 158f\n"
+ "155:" // Height 5: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 158f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "b 158f\n"
+ "156:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x10, #1, 157f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "tbz x10, #0, 158f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "b 158f\n"
+ "157:" // Height 5: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "158:" // Height 5: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 160f\n"
+ "159:" // Height 5: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q17, [x22, #0x0]\n"
+ "ldr q18, [x22, #0x10]\n"
+ "ldr q19, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q25, [x20, #0x0]\n"
+ "ldr q26, [x20, #0x10]\n"
+ "ldr q27, [x20, #0x20]\n"
+ "ldr q6, [x20, #0x30]\n"
+ "160:" // Height 5: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 162f\n"
+ "161:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "162:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "163:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 164f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 165f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 165f\n"
+ "164:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "165:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "blt 168f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 167f\n"
+ "166:" // Height 5: Multiply loop: Main loop head
+ "movi v6.4s, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
+ "bge 166b\n"
+ "167:" // Height 5: Multiply loop: Single iteration only
+ "movi v6.4s, #0x0\n"
+ "ldr q2, [x24, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
+ "168:" // Height 5: Multiply loop: Main loop skip
+ "cbz x26, 175f\n"
+ "cmp x26, #0x8\n"
+ "blt 170f\n"
+ "169:" // Height 5: Multiply loop: Odd block loop
+ "movi v7.4s, #0x0\n"
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d5, [x21], #0x8\n"
+ "ldr q6, [x9, #0x0]\n"
+ "trn1 v4.2d, v5.2d, v7.2d\n"
+ "ldr q7, [x9, #0x10]\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a498 // ummla v24.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49c // ummla v28.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a499 // ummla v25.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49d // ummla v29.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49a // ummla v26.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49e // ummla v30.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49b // ummla v27.4s, v4.16b, v6.16b\n"
+ ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49f // ummla v31.4s, v4.16b, v7.16b\n"
+ "bge 169b\n"
+ "cbz x26, 175f\n"
+ "170:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 172f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x21], #0x4\n"
+ "tbz x26, #1, 171f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v5.h }[2], [x21], #0x2\n"
+ "tbz x26, #0, 174f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "ld1 { v5.b }[6], [x21]\n"
+ "b 174f\n"
+ "171:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 174f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "ld1 { v5.b }[4], [x21]\n"
+ "b 174f\n"
+ "172:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 173f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "tbz x26, #0, 174f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
+ "b 174f\n"
+ "173:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
+ "174:" // Height 5: Multiply loop: Ragged operand read: Done
+ "movi v6.4s, #0x0\n"
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ "175:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 163b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v27.2d, v27.2d, v31.2d\n"
+ "bge 184f\n"
+ "tbz x10, #3, 179f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v17.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "st1 { v25.4s }, [x20], #0x10\n"
+ "tbz x10, #2, 177f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v18.4s }, [x21], #0x10\n"
+ "st1 { v26.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 176f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "str d27, [x20], #0x8\n"
+ "tbz x10, #0, 183f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x21]\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "b 183f\n"
+ "176:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 183f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s19, [x21, #0x0]\n"
+ "str s27, [x20, #0x0]\n"
+ "b 183f\n"
+ "177:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 178f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d18, [x21], #0x8\n"
+ "str d26, [x20], #0x8\n"
+ "tbz x10, #0, 183f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "st1 { v26.s }[2], [x20]\n"
+ "b 183f\n"
+ "178:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 183f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s18, [x21, #0x0]\n"
+ "str s26, [x20, #0x0]\n"
+ "b 183f\n"
+ "179:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 181f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v24.4s }, [x20], #0x10\n"
+ "tbz x10, #1, 180f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d17, [x21], #0x8\n"
+ "str d25, [x20], #0x8\n"
+ "tbz x10, #0, 183f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v17.s }[2], [x21]\n"
+ "st1 { v25.s }[2], [x20]\n"
+ "b 183f\n"
+ "180:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 183f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s17, [x21, #0x0]\n"
+ "str s25, [x20, #0x0]\n"
+ "b 183f\n"
+ "181:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 182f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d24, [x20], #0x8\n"
+ "tbz x10, #0, 183f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "st1 { v24.s }[2], [x20]\n"
+ "b 183f\n"
+ "182:" // Height 5: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "str s16, [x21, #0x0]\n"
+ "str s24, [x20, #0x0]\n"
+ "183:" // Height 5: Partial direct writeback: Done
+ "b 185f\n"
+ "184:" // Height 5: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q20, [x22, #0x10]\n"
+ "str q21, [x22, #0x20]\n"
+ "str q22, [x22, #0x30]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q18, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q25, [x20, #0x10]\n"
+ "str q26, [x20, #0x20]\n"
+ "str q27, [x20, #0x30]\n"
+ "185:" // Height 5: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 150b\n"
+ "b 224f\n"
+ "186:" // Height 6
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "187:" // Height 6: Column loop
+ "tbz %x[flags], #0, 198f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "cmp x10, #0x10\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "add x19, x20, x19, LSL #2\n"
+ "bge 196f\n"
+ "tbz x10, #3, 191f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v10.4s }, [x28], #0x10\n"
+ "ld1 { v13.4s }, [x23], #0x10\n"
+ "ld1 { v18.4s }, [x22], #0x10\n"
+ "ld1 { v21.4s }, [x21], #0x10\n"
+ "ld1 { v26.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "ld1 { v29.4s }, [x19], #0x10\n"
+ "tbz x10, #2, 189f\n"
+ "ld1 { v11.4s }, [x28], #0x10\n"
+ "ld1 { v14.4s }, [x23], #0x10\n"
+ "ld1 { v19.4s }, [x22], #0x10\n"
+ "ld1 { v22.4s }, [x21], #0x10\n"
+ "ld1 { v27.4s }, [x20], #0x10\n"
+ "ld1 { v30.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 188f\n"
+ "ldr d16, [x28], #0x8\n"
+ "mov x24, #0x38\n"
+ "ldr d15, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d6, [x20], #0x8\n"
+ "ldr d31, [x19], #0x8\n"
+ "tbz x10, #0, 195f\n"
+ "ld1 { v16.s }[2], [x28]\n"
+ "ld1 { v15.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v23.s }[2], [x21]\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "ld1 { v31.s }[2], [x19]\n"
+ "b 195f\n"
+ "188:" // Height 6: Partial accumulate: partial_1_12
+ "mov x24, #0x30\n"
+ "tbz x10, #0, 195f\n"
+ "ldr s16, [x28, #0x0]\n"
+ "ldr s15, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s23, [x21, #0x0]\n"
+ "ldr s6, [x20, #0x0]\n"
+ "ldr s31, [x19, #0x0]\n"
+ "b 195f\n"
+ "189:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x10, #1, 190f\n"
+ "ldr d11, [x28], #0x8\n"
+ "ldr d14, [x23], #0x8\n"
+ "mov x24, #0x28\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "ldr d27, [x20], #0x8\n"
+ "ldr d30, [x19], #0x8\n"
+ "tbz x10, #0, 195f\n"
+ "ld1 { v11.s }[2], [x28]\n"
+ "ld1 { v14.s }[2], [x23]\n"
+ "ld1 { v19.s }[2], [x22]\n"
+ "ld1 { v22.s }[2], [x21]\n"
+ "ld1 { v27.s }[2], [x20]\n"
+ "ld1 { v30.s }[2], [x19]\n"
+ "b 195f\n"
+ "190:" // Height 6: Partial accumulate: partial_1_8
+ "mov x24, #0x20\n"
+ "tbz x10, #0, 195f\n"
+ "ldr s11, [x28, #0x0]\n"
+ "ldr s14, [x23, #0x0]\n"
+ "ldr s19, [x22, #0x0]\n"
+ "ldr s22, [x21, #0x0]\n"
+ "ldr s27, [x20, #0x0]\n"
+ "ldr s30, [x19, #0x0]\n"
+ "b 195f\n"
+ "191:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x10, #2, 193f\n"
+ "ld1 { v9.4s }, [x28], #0x10\n"
+ "ld1 { v12.4s }, [x23], #0x10\n"
+ "ld1 { v17.4s }, [x22], #0x10\n"
+ "ld1 { v20.4s }, [x21], #0x10\n"
+ "ld1 { v25.4s }, [x20], #0x10\n"
+ "ld1 { v28.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 192f\n"
+ "ldr d10, [x28], #0x8\n"
+ "mov x24, #0x18\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d21, [x21], #0x8\n"
+ "ldr d26, [x20], #0x8\n"
+ "ldr d29, [x19], #0x8\n"
+ "tbz x10, #0, 195f\n"
+ "ld1 { v10.s }[2], [x28]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v21.s }[2], [x21]\n"
+ "ld1 { v26.s }[2], [x20]\n"
+ "ld1 { v29.s }[2], [x19]\n"
+ "b 195f\n"
+ "192:" // Height 6: Partial accumulate: partial_1_4
+ "mov x24, #0x10\n"
+ "tbz x10, #0, 195f\n"
+ "ldr s10, [x28, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s21, [x21, #0x0]\n"
+ "ldr s26, [x20, #0x0]\n"
+ "ldr s29, [x19, #0x0]\n"
+ "b 195f\n"
+ "193:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x10, #1, 194f\n"
+ "ldr d9, [x28], #0x8\n"
+ "ldr d12, [x23], #0x8\n"
+ "mov x24, #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "ldr d28, [x19], #0x8\n"
+ "tbz x10, #0, 195f\n"
+ "ld1 { v9.s }[2], [x28]\n"
+ "ld1 { v12.s }[2], [x23]\n"
+ "ld1 { v17.s }[2], [x22]\n"
+ "ld1 { v20.s }[2], [x21]\n"
+ "ld1 { v25.s }[2], [x20]\n"
+ "ld1 { v28.s }[2], [x19]\n"
+ "b 195f\n"
+ "194:" // Height 6: Partial accumulate: partial_1_0
+ "ldr s9, [x28, #0x0]\n"
+ "mov x24, #0x0\n"
+ "ldr s12, [x23, #0x0]\n"
+ "ldr s17, [x22, #0x0]\n"
+ "ldr s20, [x21, #0x0]\n"
+ "ldr s25, [x20, #0x0]\n"
+ "ldr s28, [x19, #0x0]\n"
+ "195:" // Height 6: Partial accumulate: Done
+ "sub x28, x28, x24\n"
+ "b 197f\n"
+ "196:" // Height 6: full accumulate
+ "ldr q9, [x28, #0x0]\n"
+ "ldr q10, [x28, #0x10]\n"
+ "ldr q11, [x28, #0x20]\n"
+ "ldr q16, [x28, #0x30]\n"
+ "ldr q12, [x23, #0x0]\n"
+ "ldr q13, [x23, #0x10]\n"
+ "ldr q14, [x23, #0x20]\n"
+ "ldr q15, [x23, #0x30]\n"
+ "ldr q17, [x22, #0x0]\n"
+ "ldr q18, [x22, #0x10]\n"
+ "ldr q19, [x22, #0x20]\n"
+ "ldr q24, [x22, #0x30]\n"
+ "ldr q20, [x21, #0x0]\n"
+ "ldr q21, [x21, #0x10]\n"
+ "ldr q22, [x21, #0x20]\n"
+ "ldr q23, [x21, #0x30]\n"
+ "ldr q25, [x20, #0x0]\n"
+ "ldr q26, [x20, #0x10]\n"
+ "ldr q27, [x20, #0x20]\n"
+ "ldr q6, [x20, #0x30]\n"
+ "ldr q28, [x19, #0x0]\n"
+ "ldr q29, [x19, #0x10]\n"
+ "ldr q30, [x19, #0x20]\n"
+ "ldr q31, [x19, #0x30]\n"
+ "197:" // Height 6: MMLA fixup
+ "zip1 v8.2d, v9.2d, v12.2d\n"
+ "zip2 v12.2d, v9.2d, v12.2d\n"
+ "zip1 v9.2d, v10.2d, v13.2d\n"
+ "zip2 v13.2d, v10.2d, v13.2d\n"
+ "zip1 v10.2d, v11.2d, v14.2d\n"
+ "zip2 v14.2d, v11.2d, v14.2d\n"
+ "zip1 v11.2d, v16.2d, v15.2d\n"
+ "zip2 v15.2d, v16.2d, v15.2d\n"
+ "zip1 v16.2d, v17.2d, v20.2d\n"
+ "zip2 v20.2d, v17.2d, v20.2d\n"
+ "zip1 v17.2d, v18.2d, v21.2d\n"
+ "zip2 v21.2d, v18.2d, v21.2d\n"
+ "zip1 v18.2d, v19.2d, v22.2d\n"
+ "zip2 v22.2d, v19.2d, v22.2d\n"
+ "zip1 v19.2d, v24.2d, v23.2d\n"
+ "zip2 v23.2d, v24.2d, v23.2d\n"
+ "zip1 v24.2d, v25.2d, v28.2d\n"
+ "zip2 v28.2d, v25.2d, v28.2d\n"
+ "zip1 v25.2d, v26.2d, v29.2d\n"
+ "zip2 v29.2d, v26.2d, v29.2d\n"
+ "zip1 v26.2d, v27.2d, v30.2d\n"
+ "zip2 v30.2d, v27.2d, v30.2d\n"
+ "zip1 v27.2d, v6.2d, v31.2d\n"
+ "zip2 v31.2d, v6.2d, v31.2d\n"
+ "b 199f\n"
+ "198:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "199:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "200:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 201f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 202f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 202f\n"
+ "201:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "202:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "blt 205f\n"
+ "ldr q1, [x25, #0x0]\n"
+ "ldr q2, [x24, #0x0]\n"
+ "cmp x26, #0x20\n"
+ "blt 204f\n"
+ "203:" // Height 6: Multiply loop: Main loop head
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x20, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "sub x26, x26, #0x10\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x26, #0x20\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ "ldr q2, [x24, #0x0]\n"
+ ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ "ldr q1, [x25, #0x0]\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
+ "bge 203b\n"
+ "204:" // Height 6: Multiply loop: Single iteration only
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q3, [x23, #0x0]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 v1.2d, v1.2d, v2.2d\n"
+ "ldr q4, [x22, #0x0]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr q5, [x21, #0x0]\n"
+ "add x24, x24, #0x10\n"
+ "trn2 v3.2d, v3.2d, v4.2d\n"
+ "ldr q6, [x20, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 v5.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ "prfm pldl1keep, [x23, #0x80]\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ "prfm pldl1keep, [x21, #0x80]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x80]\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x90]\n"
+ ".inst 0x6e87a428 // ummla v8.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a470 // ummla v16.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b8 // ummla v24.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xa0]\n"
+ ".inst 0x6e86a42c // ummla v12.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a474 // ummla v20.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bc // ummla v28.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xb0]\n"
+ ".inst 0x6e87a429 // ummla v9.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a471 // ummla v17.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4b9 // ummla v25.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xc0]\n"
+ ".inst 0x6e86a42d // ummla v13.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a475 // ummla v21.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bd // ummla v29.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xd0]\n"
+ ".inst 0x6e87a42a // ummla v10.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a472 // ummla v18.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4ba // ummla v26.4s, v5.16b, v7.16b\n"
+ "ldr q7, [x9, #0xe0]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a476 // ummla v22.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4be // ummla v30.4s, v5.16b, v6.16b\n"
+ "ldr q6, [x9, #0xf0]\n"
+ "add x9, x9, #0x100\n"
+ ".inst 0x6e87a42b // ummla v11.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e87a473 // ummla v19.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e87a4bb // ummla v27.4s, v5.16b, v7.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a477 // ummla v23.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e86a4bf // ummla v31.4s, v5.16b, v6.16b\n"
+ "205:" // Height 6: Multiply loop: Main loop skip
+ "cbz x26, 212f\n"
+ "cmp x26, #0x8\n"
+ "blt 207f\n"
+ "206:" // Height 6: Multiply loop: Odd block loop
+ "ldr d1, [x25], #0x8\n"
+ "sub x26, x26, #0x8\n"
+ "ldr d2, [x24], #0x8\n"
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr d3, [x23], #0x8\n"
+ "cmp x26, #0x8\n"
+ "ldr d4, [x22], #0x8\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "ldr d5, [x21], #0x8\n"
+ "ldr d7, [x20], #0x8\n"
+ "trn1 v4.2d, v5.2d, v7.2d\n"
+ "ldr q6, [x9, #0x0]\n"
+ "ldr q7, [x9, #0x10]\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a450 // ummla v16.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a498 // ummla v24.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x20]\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a454 // ummla v20.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49c // ummla v28.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a451 // ummla v17.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a499 // ummla v25.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x40]\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49d // ummla v29.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x50]\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a452 // ummla v18.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49a // ummla v26.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x60]\n"
+ ".inst 0x6e87a40e // ummla v14.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a456 // ummla v22.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49e // ummla v30.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e86a40b // ummla v11.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a453 // ummla v19.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49b // ummla v27.4s, v4.16b, v6.16b\n"
+ ".inst 0x6e87a40f // ummla v15.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49f // ummla v31.4s, v4.16b, v7.16b\n"
+ "bge 206b\n"
+ "cbz x26, 212f\n"
+ "207:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x26, #2, 209f\n"
+ "ldr s1, [x25], #0x4\n"
+ "ldr s2, [x24], #0x4\n"
+ "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x21], #0x4\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz x26, #1, 208f\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x22], #0x2\n"
+ "ld1 { v5.h }[2], [x21], #0x2\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x26, #0, 211f\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v3.b }[6], [x23]\n"
+ "ld1 { v4.b }[6], [x22]\n"
+ "ld1 { v5.b }[6], [x21]\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 211f\n"
+ "208:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x26, #0, 211f\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v3.b }[4], [x23]\n"
+ "ld1 { v4.b }[4], [x22]\n"
+ "ld1 { v5.b }[4], [x21]\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 211f\n"
+ "209:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x26, #1, 210f\n"
+ "ldr h1, [x25], #0x2\n"
+ "ldr h2, [x24], #0x2\n"
+ "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x21], #0x2\n"
+ "ldr h6, [x20], #0x2\n"
+ "tbz x26, #0, 211f\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v3.b }[2], [x23]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x21]\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 211f\n"
+ "210:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b1, [x25, #0x0]\n"
+ "ldr b2, [x24, #0x0]\n"
+ "ldr b3, [x23, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x21, #0x0]\n"
+ "ldr b6, [x20, #0x0]\n"
+ "211:" // Height 6: Multiply loop: Ragged operand read: Done
+ "trn1 v0.2d, v1.2d, v2.2d\n"
+ "ldr q7, [x9, #0x0]\n"
+ "trn1 v2.2d, v3.2d, v4.2d\n"
+ "trn1 v4.2d, v5.2d, v6.2d\n"
+ "ldr q6, [x9, #0x10]\n"
+ ".inst 0x6e87a408 // ummla v8.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a450 // ummla v16.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a498 // ummla v24.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x20]\n"
+ ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49c // ummla v28.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x30]\n"
+ ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a451 // ummla v17.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a499 // ummla v25.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x40]\n"
+ ".inst 0x6e86a40d // ummla v13.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49d // ummla v29.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x50]\n"
+ ".inst 0x6e87a40a // ummla v10.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a452 // ummla v18.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49a // ummla v26.4s, v4.16b, v7.16b\n"
+ "ldr q7, [x9, #0x60]\n"
+ ".inst 0x6e86a40e // ummla v14.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49e // ummla v30.4s, v4.16b, v6.16b\n"
+ "ldr q6, [x9, #0x70]\n"
+ "add x9, x9, #0x80\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a453 // ummla v19.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a49b // ummla v27.4s, v4.16b, v7.16b\n"
+ ".inst 0x6e86a40f // ummla v15.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a457 // ummla v23.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a49f // ummla v31.4s, v4.16b, v6.16b\n"
+ "212:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 200b\n"
+ "uzp1 v7.2d, v8.2d, v12.2d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 v8.2d, v8.2d, v12.2d\n"
+ "prfm pstl1keep, [x28, #0x0]\n"
+ "cmp x10, #0x10\n"
+ "uzp1 v12.2d, v9.2d, v13.2d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 v9.2d, v9.2d, v13.2d\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "uzp1 v13.2d, v10.2d, v14.2d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 v10.2d, v10.2d, v14.2d\n"
+ "prfm pstl1keep, [x22, #0x0]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 v14.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 v11.2d, v11.2d, v15.2d\n"
+ "prfm pstl1keep, [x20, #0x0]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "uzp1 v15.2d, v16.2d, v20.2d\n"
+ "prfm pstl1keep, [x19, #0x0]\n"
+ "uzp2 v16.2d, v16.2d, v20.2d\n"
+ "uzp1 v20.2d, v17.2d, v21.2d\n"
+ "uzp2 v17.2d, v17.2d, v21.2d\n"
+ "uzp1 v21.2d, v18.2d, v22.2d\n"
+ "uzp2 v18.2d, v18.2d, v22.2d\n"
+ "uzp1 v22.2d, v19.2d, v23.2d\n"
+ "uzp2 v19.2d, v19.2d, v23.2d\n"
+ "uzp1 v23.2d, v24.2d, v28.2d\n"
+ "uzp2 v24.2d, v24.2d, v28.2d\n"
+ "uzp1 v28.2d, v25.2d, v29.2d\n"
+ "uzp2 v25.2d, v25.2d, v29.2d\n"
+ "uzp1 v29.2d, v26.2d, v30.2d\n"
+ "uzp2 v26.2d, v26.2d, v30.2d\n"
+ "uzp1 v30.2d, v27.2d, v31.2d\n"
+ "uzp2 v27.2d, v27.2d, v31.2d\n"
+ "bge 221f\n"
+ "tbz x10, #3, 216f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v12.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v9.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v20.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v17.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x20], #0x10\n"
+ "st1 { v28.4s }, [x20], #0x10\n"
+ "st1 { v24.4s }, [x19], #0x10\n"
+ "st1 { v25.4s }, [x19], #0x10\n"
+ "tbz x10, #2, 214f\n"
+ "st1 { v13.4s }, [x28], #0x10\n"
+ "st1 { v10.4s }, [x23], #0x10\n"
+ "st1 { v21.4s }, [x22], #0x10\n"
+ "st1 { v18.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x20], #0x10\n"
+ "st1 { v26.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 213f\n"
+ "str d14, [x28], #0x8\n"
+ "str d11, [x23], #0x8\n"
+ "str d22, [x22], #0x8\n"
+ "str d19, [x21], #0x8\n"
+ "str d30, [x20], #0x8\n"
+ "str d27, [x19], #0x8\n"
+ "tbz x10, #0, 220f\n"
+ "st1 { v14.s }[2], [x28]\n"
+ "st1 { v11.s }[2], [x23]\n"
+ "st1 { v22.s }[2], [x22]\n"
+ "st1 { v19.s }[2], [x21]\n"
+ "st1 { v30.s }[2], [x20]\n"
+ "st1 { v27.s }[2], [x19]\n"
+ "b 220f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x10, #0, 220f\n"
+ "str s14, [x28, #0x0]\n"
+ "str s11, [x23, #0x0]\n"
+ "str s22, [x22, #0x0]\n"
+ "str s19, [x21, #0x0]\n"
+ "str s30, [x20, #0x0]\n"
+ "str s27, [x19, #0x0]\n"
+ "b 220f\n"
+ "214:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x10, #1, 215f\n"
+ "str d13, [x28], #0x8\n"
+ "str d10, [x23], #0x8\n"
+ "str d21, [x22], #0x8\n"
+ "str d18, [x21], #0x8\n"
+ "str d29, [x20], #0x8\n"
+ "str d26, [x19], #0x8\n"
+ "tbz x10, #0, 220f\n"
+ "st1 { v13.s }[2], [x28]\n"
+ "st1 { v10.s }[2], [x23]\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "st1 { v29.s }[2], [x20]\n"
+ "st1 { v26.s }[2], [x19]\n"
+ "b 220f\n"
+ "215:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x10, #0, 220f\n"
+ "str s13, [x28, #0x0]\n"
+ "str s10, [x23, #0x0]\n"
+ "str s21, [x22, #0x0]\n"
+ "str s18, [x21, #0x0]\n"
+ "str s29, [x20, #0x0]\n"
+ "str s26, [x19, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x10, #2, 218f\n"
+ "st1 { v7.4s }, [x28], #0x10\n"
+ "st1 { v8.4s }, [x23], #0x10\n"
+ "st1 { v15.4s }, [x22], #0x10\n"
+ "st1 { v16.4s }, [x21], #0x10\n"
+ "st1 { v23.4s }, [x20], #0x10\n"
+ "st1 { v24.4s }, [x19], #0x10\n"
+ "tbz x10, #1, 217f\n"
+ "str d12, [x28], #0x8\n"
+ "str d9, [x23], #0x8\n"
+ "str d20, [x22], #0x8\n"
+ "str d17, [x21], #0x8\n"
+ "str d28, [x20], #0x8\n"
+ "str d25, [x19], #0x8\n"
+ "tbz x10, #0, 220f\n"
+ "st1 { v12.s }[2], [x28]\n"
+ "st1 { v9.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v17.s }[2], [x21]\n"
+ "st1 { v28.s }[2], [x20]\n"
+ "st1 { v25.s }[2], [x19]\n"
+ "b 220f\n"
+ "217:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x10, #0, 220f\n"
+ "str s12, [x28, #0x0]\n"
+ "str s9, [x23, #0x0]\n"
+ "str s20, [x22, #0x0]\n"
+ "str s17, [x21, #0x0]\n"
+ "str s28, [x20, #0x0]\n"
+ "str s25, [x19, #0x0]\n"
+ "b 220f\n"
+ "218:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x10, #1, 219f\n"
+ "str d7, [x28], #0x8\n"
+ "str d8, [x23], #0x8\n"
+ "str d15, [x22], #0x8\n"
+ "str d16, [x21], #0x8\n"
+ "str d23, [x20], #0x8\n"
+ "str d24, [x19], #0x8\n"
+ "tbz x10, #0, 220f\n"
+ "st1 { v7.s }[2], [x28]\n"
+ "st1 { v8.s }[2], [x23]\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "st1 { v24.s }[2], [x19]\n"
+ "b 220f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_0
+ "str s7, [x28, #0x0]\n"
+ "str s8, [x23, #0x0]\n"
+ "str s15, [x22, #0x0]\n"
+ "str s16, [x21, #0x0]\n"
+ "str s23, [x20, #0x0]\n"
+ "str s24, [x19, #0x0]\n"
+ "220:" // Height 6: Partial direct writeback: Done
+ "b 222f\n"
+ "221:" // Height 6: Full writeback
+ "str q7, [x28, #0x0]\n"
+ "str q12, [x28, #0x10]\n"
+ "str q13, [x28, #0x20]\n"
+ "str q14, [x28, #0x30]\n"
+ "add x28, x28, #0x40\n"
+ "str q8, [x23, #0x0]\n"
+ "str q9, [x23, #0x10]\n"
+ "str q10, [x23, #0x20]\n"
+ "str q11, [x23, #0x30]\n"
+ "str q15, [x22, #0x0]\n"
+ "str q20, [x22, #0x10]\n"
+ "str q21, [x22, #0x20]\n"
+ "str q22, [x22, #0x30]\n"
+ "str q16, [x21, #0x0]\n"
+ "str q17, [x21, #0x10]\n"
+ "str q18, [x21, #0x20]\n"
+ "str q19, [x21, #0x30]\n"
+ "str q23, [x20, #0x0]\n"
+ "str q28, [x20, #0x10]\n"
+ "str q29, [x20, #0x20]\n"
+ "str q30, [x20, #0x30]\n"
+ "str q24, [x19, #0x0]\n"
+ "str q25, [x19, #0x10]\n"
+ "str q26, [x19, #0x20]\n"
+ "str q27, [x19, #0x30]\n"
+ "222:" // Height 6: Writeback done
+ "subs x10, x10, #0x10\n"
+ "bgt 187b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 224f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 223f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "223:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "224:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
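
Note on the UMMLA layout used in the kernel above: each "ummla vd.4s, vn.16b, vm.16b" accumulates a 2x2 block of 32-bit sums from a 2x8 block of unsigned bytes in vn and an 8x2 block in vm. That is why the kernel interleaves pairs of input rows with trn1/trn2 before the multiply loop, why the "MMLA fixup" blocks zip previously stored results into row-pair order with zip1/zip2 on load, and why writeback first un-zips the accumulators with uzp1/uzp2. A minimal scalar model of one UMMLA step, for illustration only (the function name and layout comments are hypothetical, not library API):

    // Illustrative only: scalar model of one 8-bit UMMLA instruction.
    // a: the 2x8 first operand (two rows of 8 unsigned bytes, as in vn).
    // b: the 8x2 second operand, stored as two groups of 8 bytes where
    //    group c holds column c (as packed in vm).
    // acc: the 2x2 destination, row-major: [r0c0, r0c1, r1c0, r1c1].
    #include <cstdint>

    static void ummla_model(const uint8_t a[2][8], const uint8_t b[2][8],
                            uint32_t acc[4])
    {
        for (int r = 0; r < 2; r++) {
            for (int c = 0; c < 2; c++) {
                uint32_t sum = 0;
                for (int k = 0; k < 8; k++) {
                    // Row r of A dot column c of B, widened to 32 bits.
                    sum += (uint32_t)a[r][k] * (uint32_t)b[c][k];
                }
                acc[r * 2 + c] += sum;
            }
        }
    }

Because each accumulator register holds results for a pair of output rows, an odd trailing row (as in the Height 5 path) is padded with a zeroed register (movi v6.4s, #0x0) on input and discarded with a plain uzp1 on output.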
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
index 2fea5ad2e7..153a4cc167 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,63 +10,92 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
-
-#include "../bfloat.hpp"
#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_dot_8x12( ARGLIST );
-class cls_a64_interleaved_bf16fp32_dot_8x12 {
+class cls_a64_interleaved_bf16fp32_dot_8x12
+{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return 12;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return 4;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 2;
}
- // Use the standard fixed size transforms.
+
StdTransformsFixed<operand_type, result_type, 8, 12, 2> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=a64_interleaved_bf16fp32_dot_8x12;
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.93, 4.16, 7.19 };
+ case CPUModel::V1:
+ return { 20.88, 5.10, 6.57 };
+ case CPUModel::A510:
+ return { 7.77, 3.69, 3.02 };
+ }
+ }
+
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_bf16fp32_dot_8x12;
cls_a64_interleaved_bf16fp32_dot_8x12(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // __aarch64__
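
The per-model triples returned by get_performance_parameters above (e.g. { 15.93, 4.16, 7.19 } for the default, { 20.88, 5.10, 6.57 } for V1) feed the improved kernel selection heuristics this patch introduces: candidate kernels can be ranked by an estimated cost for the problem size rather than by a fixed priority order. A rough sketch of how such estimates could be consumed, assuming the three values describe steady-state MAC throughput plus operand-prepare and result-merge throughputs (the struct and field names below are assumptions for illustration, not the library's definitions):

    #include <cstdint>

    // Hypothetical mirror of a per-kernel performance estimate.
    struct PerfEstimate {
        float macs_per_cycle;          // steady-state multiply-accumulates/cycle
        float prepare_bytes_per_cycle; // operand packing throughput
        float merge_bytes_per_cycle;   // result merge/writeback throughput
    };

    // Rough cycle estimate for an M x N x K GEMM under these parameters:
    // compute cost plus packing both operands plus merging the fp32 output.
    static float estimate_cycles(const PerfEstimate &p,
                                 uint64_t M, uint64_t N, uint64_t K)
    {
        float mac_cycles   = (float)(M * N * K) / p.macs_per_cycle;
        float prep_cycles  = (float)(M * K + N * K) / p.prepare_bytes_per_cycle;
        float merge_cycles = (float)(M * N * 4) / p.merge_bytes_per_cycle;
        return mac_cycles + prep_cycles + merge_cycles;
    }

    // A selector would evaluate this for each candidate kernel on the
    // detected CPUModel and pick the lowest estimate.
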
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index 92149a5579..5689f89781 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,305 +23,231 @@
*/
#ifdef __aarch64__
+#include <cstddef>
#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
+void a64_interleaved_bf16fp32_dot_8x12(
+ const bfloat16 *Apanel, const bfloat16 *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
- K /= 2;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/2) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
+ __asm__ __volatile__(
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q4, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "movi v14.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- "movi v15.4s, #0\n"
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- "movi v24.4s, #0\n"
- "movi v25.4s, #0\n"
- "movi v26.4s, #0\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
- "movi v29.4s, #0\n"
- "movi v30.4s, #0\n"
- "movi v31.4s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
- ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8af // bfdot v15.4s, v5.8h, v0.h[3]\n"
- ".inst 0x4f41f0b8 // bfdot v24.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4f41f0dc // bfdot v28.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
- ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
- ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
- ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
- ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
- ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
- ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
- ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
- ".inst 0x4f62f8af // bfdot v15.4s, v5.8h, v2.h[3]\n"
- ".inst 0x4f43f0b8 // bfdot v24.4s, v5.8h, v3.h[0]\n"
- ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
- ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
- ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
- "ldr q5, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x4f43f0dc // bfdot v28.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
- ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8af // bfdot v15.4s, v5.8h, v0.h[3]\n"
- ".inst 0x4f41f0b8 // bfdot v24.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4f41f0dc // bfdot v28.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
- ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
- ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
- ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
- ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
- ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
- ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
- ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
- ".inst 0x4f62f8af // bfdot v15.4s, v5.8h, v2.h[3]\n"
- ".inst 0x4f43f0b8 // bfdot v24.4s, v5.8h, v3.h[0]\n"
- ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
- ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
- ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
- "ldr q5, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
- ".inst 0x4f43f0dc // bfdot v28.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
- ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
- "str q8, [%[c_ptr]]\n"
- ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8af // bfdot v15.4s, v5.8h, v0.h[3]\n"
- "str q12, [%[c_ptr], #0x10]\n"
- ".inst 0x4f41f0b8 // bfdot v24.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
- "str q16, [%[c_ptr], #0x20]\n"
- ".inst 0x4f41f0dc // bfdot v28.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
- "str q9, [%[c_ptr], #0x30]\n"
- ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
- ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
- ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
- ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
- ".inst 0x4f60f8af // bfdot v15.4s, v5.8h, v0.h[3]\n"
- ".inst 0x4f41f0b8 // bfdot v24.4s, v5.8h, v1.h[0]\n"
- ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
- ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
- ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
- "ldr q5, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
- ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
- ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
- ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
- ".inst 0x4f41f0dc // bfdot v28.4s, v6.8h, v1.h[0]\n"
- ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
- ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
- ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
- "ldr q6, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
- ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
- ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
- ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
- "str q8, [%[c_ptr]]\n"
- ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
- ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
- ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
- ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
- ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
- ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
- ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
- ".inst 0x4f62f8af // bfdot v15.4s, v5.8h, v2.h[3]\n"
- "str q12, [%[c_ptr], #0x10]\n"
- ".inst 0x4f43f0b8 // bfdot v24.4s, v5.8h, v3.h[0]\n"
- ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
- ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
- ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
- ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
- ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
- ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
- ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
- "str q16, [%[c_ptr], #0x20]\n"
- ".inst 0x4f43f0dc // bfdot v28.4s, v6.8h, v3.h[0]\n"
- ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
- ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
- "str q9, [%[c_ptr], #0x30]\n"
- ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
- "4:\n"
- "str q13, [%[c_ptr], #0x40]\n"
- "str q17, [%[c_ptr], #0x50]\n"
- "str q10, [%[c_ptr], #0x60]\n"
- "str q14, [%[c_ptr], #0x70]\n"
- "str q18, [%[c_ptr], #0x80]\n"
- "str q11, [%[c_ptr], #0x90]\n"
- "str q15, [%[c_ptr], #0xa0]\n"
- "str q19, [%[c_ptr], #0xb0]\n"
- "str q20, [%[c_ptr], #0xc0]\n"
- "str q24, [%[c_ptr], #0xd0]\n"
- "str q28, [%[c_ptr], #0xe0]\n"
- "str q21, [%[c_ptr], #0xf0]\n"
- "str q25, [%[c_ptr], #0x100]\n"
- "str q29, [%[c_ptr], #0x110]\n"
- "str q22, [%[c_ptr], #0x120]\n"
- "str q26, [%[c_ptr], #0x130]\n"
- "str q30, [%[c_ptr], #0x140]\n"
- "str q23, [%[c_ptr], #0x150]\n"
- "str q27, [%[c_ptr], #0x160]\n"
- "str q31, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "ldr q4, [x20, #0x0]\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "ldr q5, [x20, #0x10]\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "ldr q6, [x20, #0x20]\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
+ "ldr q3, [%x[Apanel], #0x30]\n"
+ ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [x20, #0x30]\n"
+ ".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n"
+ ".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n"
+ ".inst 0x4f60f8b2 // bfdot v18.4s, v5.8h, v0.h[3]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ ".inst 0x4f41f0b5 // bfdot v21.4s, v5.8h, v1.h[0]\n"
+ ".inst 0x4f61f0b8 // bfdot v24.4s, v5.8h, v1.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x4f41f8bb // bfdot v27.4s, v5.8h, v1.h[2]\n"
+ ".inst 0x4f61f8be // bfdot v30.4s, v5.8h, v1.h[3]\n"
+ "ldr q5, [x20, #0x40]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f60f0cd // bfdot v13.4s, v6.8h, v0.h[1]\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ ".inst 0x4f40f8d0 // bfdot v16.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ ".inst 0x4f41f0d6 // bfdot v22.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
+ "ldr q6, [x20, #0x50]\n"
+ "add x20, x20, #0x60\n"
+ ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
+ ".inst 0x4f62f08b // bfdot v11.4s, v4.8h, v2.h[1]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ ".inst 0x4f42f88e // bfdot v14.4s, v4.8h, v2.h[2]\n"
+ ".inst 0x4f62f891 // bfdot v17.4s, v4.8h, v2.h[3]\n"
+ ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
+ ".inst 0x4f63f097 // bfdot v23.4s, v4.8h, v3.h[1]\n"
+ ".inst 0x4f43f89a // bfdot v26.4s, v4.8h, v3.h[2]\n"
+ ".inst 0x4f63f89d // bfdot v29.4s, v4.8h, v3.h[3]\n"
+ "ldr q4, [x20, #0x0]\n"
+ ".inst 0x4f42f0a9 // bfdot v9.4s, v5.8h, v2.h[0]\n"
+ ".inst 0x4f62f0ac // bfdot v12.4s, v5.8h, v2.h[1]\n"
+ ".inst 0x4f42f8af // bfdot v15.4s, v5.8h, v2.h[2]\n"
+ ".inst 0x4f62f8b2 // bfdot v18.4s, v5.8h, v2.h[3]\n"
+ ".inst 0x4f43f0b5 // bfdot v21.4s, v5.8h, v3.h[0]\n"
+ ".inst 0x4f63f0b8 // bfdot v24.4s, v5.8h, v3.h[1]\n"
+ ".inst 0x4f43f8bb // bfdot v27.4s, v5.8h, v3.h[2]\n"
+ ".inst 0x4f63f8be // bfdot v30.4s, v5.8h, v3.h[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ ".inst 0x4f42f0ca // bfdot v10.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f62f0cd // bfdot v13.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f62f8d3 // bfdot v19.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f63f0d9 // bfdot v25.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f43f8dc // bfdot v28.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
+ "ldr q6, [x20, #0x20]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f08b // bfdot v11.4s, v4.8h, v0.h[1]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x4f40f88e // bfdot v14.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f891 // bfdot v17.4s, v4.8h, v0.h[3]\n"
+ ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f097 // bfdot v23.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89a // bfdot v26.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89d // bfdot v29.4s, v4.8h, v1.h[3]\n"
+ ".inst 0x4f40f0a9 // bfdot v9.4s, v5.8h, v0.h[0]\n"
+ ".inst 0x4f60f0ac // bfdot v12.4s, v5.8h, v0.h[1]\n"
+ ".inst 0x4f40f8af // bfdot v15.4s, v5.8h, v0.h[2]\n"
+ ".inst 0x4f60f8b2 // bfdot v18.4s, v5.8h, v0.h[3]\n"
+ ".inst 0x4f41f0b5 // bfdot v21.4s, v5.8h, v1.h[0]\n"
+ ".inst 0x4f61f0b8 // bfdot v24.4s, v5.8h, v1.h[1]\n"
+ ".inst 0x4f41f8bb // bfdot v27.4s, v5.8h, v1.h[2]\n"
+ ".inst 0x4f61f8be // bfdot v30.4s, v5.8h, v1.h[3]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f60f0cd // bfdot v13.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f40f8d0 // bfdot v16.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f60f8d3 // bfdot v19.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f41f0d6 // bfdot v22.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f61f0d9 // bfdot v25.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f41f8dc // bfdot v28.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
+ "cbz x19, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q4, [x20, #0x10]\n"
+ ".inst 0x4f40f0e8 // bfdot v8.4s, v7.8h, v0.h[0]\n"
+ "ldr q5, [x20, #0x20]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f40f8ee // bfdot v14.4s, v7.8h, v0.h[2]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x4f60f8f1 // bfdot v17.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f41f0f4 // bfdot v20.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f61f0f7 // bfdot v23.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f41f8fa // bfdot v26.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f61f8fd // bfdot v29.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f40f089 // bfdot v9.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f08c // bfdot v12.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f88f // bfdot v15.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f892 // bfdot v18.4s, v4.8h, v0.h[3]\n"
+ ".inst 0x4f41f095 // bfdot v21.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f098 // bfdot v24.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89b // bfdot v27.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89e // bfdot v30.4s, v4.8h, v1.h[3]\n"
+ ".inst 0x4f40f0aa // bfdot v10.4s, v5.8h, v0.h[0]\n"
+ ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
+ ".inst 0x4f40f8b0 // bfdot v16.4s, v5.8h, v0.h[2]\n"
+ ".inst 0x4f60f8b3 // bfdot v19.4s, v5.8h, v0.h[3]\n"
+ ".inst 0x4f41f0b6 // bfdot v22.4s, v5.8h, v1.h[0]\n"
+ ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
+ ".inst 0x4f41f8bc // bfdot v28.4s, v5.8h, v1.h[2]\n"
+ ".inst 0x4f61f8bf // bfdot v31.4s, v5.8h, v1.h[3]\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
}
} // namespace arm_gemm
-
#endif // __aarch64__
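
Each ".inst 0x4f..." word above encodes a by-element BFDOT: one 32-bit accumulator lane gains the dot product of a bfloat16 pair from the A register with a selected bfloat16 pair from the B register. A scalar model of that lane update is sketched below as a readability aid; it ignores BFDOT's exact intermediate rounding and flush-to-zero behaviour, so it is not bit-exact.

// Scalar model of one destination lane of a by-element BFDOT:
// acc += a[0]*b[0] + a[1]*b[1], with bf16 operands widened to fp32.
#include <cstdint>
#include <cstring>

static float bf16_to_f32(uint16_t h) {
    uint32_t bits = uint32_t(h) << 16;  // bfloat16 is the top half of an fp32
    float f;
    std::memcpy(&f, &bits, sizeof f);
    return f;
}

static float bfdot_lane(float acc, const uint16_t a[2], const uint16_t b[2]) {
    return acc + bf16_to_f32(a[0]) * bf16_to_f32(b[0])
               + bf16_to_f32(a[1]) * bf16_to_f32(b[1]);
}
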
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp
new file mode 100644
index 0000000000..304fb64891
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include "../../bfloat.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_dot_8x12_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+ const bfloat16 *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ K /= 2;
+ const long loops_count = (K / 2) - 1;
+ const long tails_count = K % 2;
+
+ for (int yb=0; yb<ablocks; yb++) {
+ const bfloat16 *a_ptr0 = a_ptr;
+ const bfloat16 *b_ptr = Bpanel;
+
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ long loops = loops_count;
+ long tails = tails_count;
+
+ __asm __volatile (
+ "movi v8.4s, #0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v9.4s, #0\n"
+ "ldr q2, [%[b_ptr]]\n"
+ "movi v10.4s, #0\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ "movi v11.4s, #0\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ "movi v12.4s, #0\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
+ "movi v13.4s, #0\n"
+ "movi v14.4s, #0\n"
+ "movi v15.4s, #0\n"
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n"
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n"
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n"
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ "cbz %[loops], 1f\n"
+ "2:\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr]]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr]]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #0x20]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "b.ne 2b\n"
+ "1:\n"
+ "cbz %[tails], 3f\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr]]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #0x10]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr]]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #0x20]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x40\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ "str q8, [%[c_ptr]]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ "str q12, [%[c_ptr], #0x10]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "str q16, [%[c_ptr], #0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ "str q9, [%[c_ptr], #0x30]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "b 4f\n"
+ "3:\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ "add %[a_ptr], %[a_ptr], #0x20\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ "ldr q2, [%[b_ptr], #-0x30]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ "ldr q3, [%[b_ptr], #-0x20]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "ldr q0, [%[a_ptr], #-0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "ldr q4, [%[b_ptr], #-0x10]\n"
+ ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+ "ldr q1, [%[a_ptr], #-0x10]\n"
+ ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+ ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+ ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+ "str q8, [%[c_ptr]]\n"
+ ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+ ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+ ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+ ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+ ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+ ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+ ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+ ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+ "str q12, [%[c_ptr], #0x10]\n"
+ ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+ ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+ ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+ ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+ ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+ ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+ ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+ ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+ "str q16, [%[c_ptr], #0x20]\n"
+ ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+ ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+ ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+ "str q9, [%[c_ptr], #0x30]\n"
+ ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+ "4:\n"
+ "str q13, [%[c_ptr], #0x40]\n"
+ "str q17, [%[c_ptr], #0x50]\n"
+ "str q10, [%[c_ptr], #0x60]\n"
+ "str q14, [%[c_ptr], #0x70]\n"
+ "str q18, [%[c_ptr], #0x80]\n"
+ "str q11, [%[c_ptr], #0x90]\n"
+ "str q15, [%[c_ptr], #0xa0]\n"
+ "str q19, [%[c_ptr], #0xb0]\n"
+ "str q20, [%[c_ptr], #0xc0]\n"
+ "str q24, [%[c_ptr], #0xd0]\n"
+ "str q28, [%[c_ptr], #0xe0]\n"
+ "str q21, [%[c_ptr], #0xf0]\n"
+ "str q25, [%[c_ptr], #0x100]\n"
+ "str q29, [%[c_ptr], #0x110]\n"
+ "str q22, [%[c_ptr], #0x120]\n"
+ "str q26, [%[c_ptr], #0x130]\n"
+ "str q30, [%[c_ptr], #0x140]\n"
+ "str q23, [%[c_ptr], #0x150]\n"
+ "str q27, [%[c_ptr], #0x160]\n"
+ "str q31, [%[c_ptr], #0x170]\n"
+ "add %[c_ptr], %[c_ptr], #0x180\n"
+ : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [loops] "+r" (loops), [tails] "+r" (tails)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
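
The _x1 suffix marks a Cortex-X1 tuned body for the same dot_8x12 kernel. As committed, the class header earlier in this diff leaves its constructor empty and keeps the generic kernel, so the wiring below is illustrative only: it shows the constructor-time dispatch pattern this library uses elsewhere for tuned variants, with CPUInfo and CPUModel stubbed to keep the sketch self-contained.

// Hypothetical per-CPU variant selection; unsigned short stands in for
// bfloat16 and the kernel bodies are empty stubs.
enum class CPUModel { GENERIC, A510, V1, X1 };

struct CPUInfo {
    CPUModel get_cpu_model() const { return model; }
    CPUModel model = CPUModel::GENERIC;
};

typedef void (*kern_type)(const unsigned short *, const unsigned short *,
                          float *, int, int, int);

void kern_generic(const unsigned short *, const unsigned short *, float *, int, int, int) {}
void kern_x1(const unsigned short *, const unsigned short *, float *, int, int, int) {}

struct cls_sketch {
    kern_type kernel = kern_generic;       // default to the generic kernel
    explicit cls_sketch(const CPUInfo *ci) {
        if (ci && ci->get_cpu_model() == CPUModel::X1) {
            kernel = kern_x1;              // swap in the Cortex-X1 tuned body
        }
    }
};
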
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index b2c2407b28..9b3517a802 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,63 +10,104 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
-
-#include "../bfloat.hpp"
#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_mmla_8x12( ARGLIST );
-class cls_a64_interleaved_bf16fp32_mmla_8x12 {
+class cls_a64_interleaved_bf16fp32_mmla_8x12
+{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return 12;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return 4;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 4;
}
- // Use the standard fixed size transforms.
+
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.54, 4.30, 7.33 };
+ case CPUModel::V1:
+ return { 59.94, 5.08, 9.83 };
+ case CPUModel::A510:
+ return { 7.82, 4.05, 3.07 };
+ }
+ }
- kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.15, 2.51, 5.25 };
+ case CPUModel::V1:
+ return { 59.44, 3.18, 7.26 };
+ case CPUModel::A510:
+ return { 7.83, 2.53, 2.71 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // __aarch64__
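
The rewritten kernel bodies in this patch (including the one that follows) hand their loop bounds to the assembly through a small stack struct: the asm receives a single args_ptr register plus offsetof() immediates, rather than one register operand per value. A standalone reduction of that pattern is sketched below, aarch64 only, using the same constraint style as the kernels themselves.

// Minimal sketch of the KernelArgs/offsetof argument-passing pattern.
#include <cstddef>

struct KernelArgs {
    size_t bblocks = {};
    size_t K       = {};
    const void *Bpanel = {};
};

#ifdef __aarch64__
inline size_t load_K(const KernelArgs &ka) {
    size_t out;
    __asm__ __volatile__(
        "ldr %x[out], [%x[args_ptr], %[offsetof_K]]\n"
        : [out] "=r" (out)
        : [args_ptr] "r" (&ka),
          [offsetof_K] "I" (offsetof(KernelArgs, K))
        : "memory");
    return out;  // same value as ka.K, loaded the way the kernels do it
}
#endif // __aarch64__

This keeps general-purpose registers free for the inner loop, which matters in the bodies above where x19-x22 are already consumed by loop counters and panel pointers.
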
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index c476fcf171..94c72a31c9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,406 +23,269 @@
*/
#ifdef __aarch64__
+#include <cstddef>
#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
+void a64_interleaved_bf16fp32_mmla_8x12(
+ const bfloat16 *Apanel, const bfloat16 *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
- K /= 4;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
+ __asm__ __volatile__(
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q4, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- "movi v14.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x40]\n"
- "movi v15.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x40]\n"
- "movi v16.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x80]\n"
- "movi v17.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x80]\n"
- "movi v18.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0xc0]\n"
- "movi v19.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0xc0]\n"
- "movi v20.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x100]\n"
- "movi v21.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x100]\n"
- "movi v22.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x140]\n"
- "movi v23.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x140]\n"
- "movi v24.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x180]\n"
- "movi v25.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x180]\n"
- "movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
- "movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n"
- "movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
- "movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
- "movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x240]\n"
- "movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x280]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x380]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x3c0]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x400]\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
- ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x300]\n"
- ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x480]\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x4c0]\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "add %[b_ptr], %[b_ptr], #0xc0\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- "add %[b_ptr], %[b_ptr], #0xe0\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x70]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- "str q0, [%[c_ptr]]\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x70]\n"
- ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
- ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e44ec0c // bfmmla v12.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec32 // bfmmla v18.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec58 // bfmmla v24.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7e // bfmmla v30.4s, v3.8h, v4.8h\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
- ".inst 0x6e47ec09 // bfmmla v9.4s, v0.8h, v7.8h\n"
- ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e47ec2f // bfmmla v15.4s, v1.8h, v7.8h\n"
- ".inst 0x6e47ec55 // bfmmla v21.4s, v2.8h, v7.8h\n"
- ".inst 0x6e47ec7b // bfmmla v27.4s, v3.8h, v7.8h\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
- ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
- ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
- ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
- ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
- ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
- ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
- ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x6e46ec0c // bfmmla v12.4s, v0.8h, v6.8h\n"
- ".inst 0x6e46ec32 // bfmmla v18.4s, v1.8h, v6.8h\n"
- ".inst 0x6e46ec58 // bfmmla v24.4s, v2.8h, v6.8h\n"
- ".inst 0x6e46ec7e // bfmmla v30.4s, v3.8h, v6.8h\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "str q0, [%[c_ptr]]\n"
- ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- "4:\n"
- "uzp2 v2.2d, v16.2d, v17.2d\n"
- "str q3, [%[c_ptr], #0x30]\n"
- "uzp2 v3.2d, v18.2d, v19.2d\n"
- "str q4, [%[c_ptr], #0x40]\n"
- "uzp1 v4.2d, v20.2d, v21.2d\n"
- "str q5, [%[c_ptr], #0x50]\n"
- "uzp1 v5.2d, v22.2d, v23.2d\n"
- "str q6, [%[c_ptr], #0x60]\n"
- "uzp1 v6.2d, v24.2d, v25.2d\n"
- "str q7, [%[c_ptr], #0x70]\n"
- "uzp2 v7.2d, v20.2d, v21.2d\n"
- "str q0, [%[c_ptr], #0x80]\n"
- "uzp2 v0.2d, v22.2d, v23.2d\n"
- "str q1, [%[c_ptr], #0x90]\n"
- "uzp2 v1.2d, v24.2d, v25.2d\n"
- "str q2, [%[c_ptr], #0xa0]\n"
- "uzp1 v2.2d, v26.2d, v27.2d\n"
- "str q3, [%[c_ptr], #0xb0]\n"
- "uzp1 v3.2d, v28.2d, v29.2d\n"
- "str q4, [%[c_ptr], #0xc0]\n"
- "uzp1 v4.2d, v30.2d, v31.2d\n"
- "str q5, [%[c_ptr], #0xd0]\n"
- "uzp2 v5.2d, v26.2d, v27.2d\n"
- "str q6, [%[c_ptr], #0xe0]\n"
- "uzp2 v6.2d, v28.2d, v29.2d\n"
- "str q7, [%[c_ptr], #0xf0]\n"
- "uzp2 v7.2d, v30.2d, v31.2d\n"
- "str q0, [%[c_ptr], #0x100]\n"
- "str q1, [%[c_ptr], #0x110]\n"
- "str q2, [%[c_ptr], #0x120]\n"
- "str q3, [%[c_ptr], #0x130]\n"
- "str q4, [%[c_ptr], #0x140]\n"
- "str q5, [%[c_ptr], #0x150]\n"
- "str q6, [%[c_ptr], #0x160]\n"
- "str q7, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "ldr q4, [x20, #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "ldr q5, [x20, #0x10]\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "add x20, x20, #0x20\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ "ldr q6, [x20, #0x0]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldr q4, [x20, #0x20]\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ "ldr q6, [x20, #0x40]\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ "ldr q0, [%x[Apanel], #0x10]\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ "ldr q1, [%x[Apanel], #0x20]\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ "ldr q7, [x20, #0x50]\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ "ldr q2, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "ldr q3, [%x[Apanel], #0x40]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ldr q4, [x20, #0x60]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ "ldr q5, [x20, #0x70]\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
+ "ldr q6, [x20, #0x80]\n"
+ "ldr q7, [x20, #0x90]\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ "ldr q4, [x20, #0xa0]\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ "ldr q0, [%x[Apanel], #0x50]\n"
+ ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ "ldr q1, [%x[Apanel], #0x60]\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ "ldr q5, [x20, #0xb0]\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ "ldr q2, [%x[Apanel], #0x70]\n"
+ ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "add x20, x20, #0xc0\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec0b // bfmmla v11.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e45ec31 // bfmmla v17.4s, v1.8h, v5.8h\n"
+ "ldr q6, [x20, #0x0]\n"
+ ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec57 // bfmmla v23.4s, v2.8h, v5.8h\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7d // bfmmla v29.4s, v3.8h, v5.8h\n"
+ "ldr q4, [x20, #0x20]\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x6e46ec09 // bfmmla v9.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e46ec2f // bfmmla v15.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e46ec55 // bfmmla v21.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e46ec7b // bfmmla v27.4s, v3.8h, v6.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x6e47ec0c // bfmmla v12.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e44ec0a // bfmmla v10.4s, v0.8h, v4.8h\n"
+ "add x20, x20, #0x40\n"
+ ".inst 0x6e45ec0d // bfmmla v13.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e47ec32 // bfmmla v18.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e44ec30 // bfmmla v16.4s, v1.8h, v4.8h\n"
+ ".inst 0x6e45ec33 // bfmmla v19.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e47ec58 // bfmmla v24.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e47ec7e // bfmmla v30.4s, v3.8h, v7.8h\n"
+ ".inst 0x6e44ec56 // bfmmla v22.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e45ec59 // bfmmla v25.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e44ec7c // bfmmla v28.4s, v3.8h, v4.8h\n"
+ ".inst 0x6e45ec7f // bfmmla v31.4s, v3.8h, v5.8h\n"
+ "cbz x19, 5f\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e46ec08 // bfmmla v8.4s, v0.8h, v6.8h\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x6e46ec2e // bfmmla v14.4s, v1.8h, v6.8h\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr q3, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e47ec0b // bfmmla v11.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e47ec31 // bfmmla v17.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e46ec54 // bfmmla v20.4s, v2.8h, v6.8h\n"
+ "ldr q4, [x20, #0x20]\n"
+ ".inst 0x6e47ec57 // bfmmla v23.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7a // bfmmla v26.4s, v3.8h, v6.8h\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x6e47ec7d // bfmmla v29.4s, v3.8h, v7.8h\n"
+ "ldr q6, [x20, #0x40]\n"
+ "ldr q7, [x20, #0x50]\n"
+ ".inst 0x6e44ec09 // bfmmla v9.4s, v0.8h, v4.8h\n"
+ ".inst 0x6e44ec2f // bfmmla v15.4s, v1.8h, v4.8h\n"
+ "add x20, x20, #0x60\n"
+ ".inst 0x6e44ec55 // bfmmla v21.4s, v2.8h, v4.8h\n"
+ ".inst 0x6e44ec7b // bfmmla v27.4s, v3.8h, v4.8h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6e45ec0c // bfmmla v12.4s, v0.8h, v5.8h\n"
+ ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+ ".inst 0x6e47ec0d // bfmmla v13.4s, v0.8h, v7.8h\n"
+ ".inst 0x6e45ec32 // bfmmla v18.4s, v1.8h, v5.8h\n"
+ ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+ ".inst 0x6e47ec33 // bfmmla v19.4s, v1.8h, v7.8h\n"
+ ".inst 0x6e45ec58 // bfmmla v24.4s, v2.8h, v5.8h\n"
+ ".inst 0x6e45ec7e // bfmmla v30.4s, v3.8h, v5.8h\n"
+ ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
+ ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
+ ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
+ ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
}
} // namespace arm_gemm
-
#endif // __aarch64__
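
The hunk above swaps the old a_ptr/b_ptr/c_ptr register plumbing for the KernelArgs convention used by the regenerated kernels. The rewritten tail (label 5) is where the BFMMLA tile layout is undone: each accumulator holds a 2x2 output tile in lane order r0c0, r0c1, r1c0, r1c1, so pairing uzp1/uzp2 on the 64-bit lanes of two column-adjacent accumulators recovers plain output rows before the stores to Cpanel. A minimal scalar model of that de-interleave, assuming this tile ordering (it is implied by the store offsets rather than spelled out in the patch):

    // Sketch: split two column-adjacent 2x2 tiles back into rows, mirroring
    // "uzp1 v4.2d, v8.2d, v11.2d" / "uzp2 v8.2d, v8.2d, v11.2d" above.
    #include <array>
    using Tile = std::array<float, 4>; // {r0c0, r0c1, r1c0, r1c1}
    using Row  = std::array<float, 4>; // four consecutive output columns

    static Row uzp1_2d(const Tile &a, const Tile &b) // even 64-bit lanes
    {
        return {a[0], a[1], b[0], b[1]};             // row 0, columns 0..3
    }
    static Row uzp2_2d(const Tile &a, const Tile &b) // odd 64-bit lanes
    {
        return {a[2], a[3], b[2], b[3]};             // row 1, columns 0..3
    }
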
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp
new file mode 100644
index 0000000000..ce63600424
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const __fp16 *, const __fp16 *, \
+ __fp16 *, int, int, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_interleaved_fp16_mla_8x24( ARGLIST );
+void a64_interleaved_fp16_mla_8x24_a55( ARGLIST );
+void a64_interleaved_fp16_mla_8x24_x1( ARGLIST );
+
+class cls_a64_interleaved_fp16_mla_8x24
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 24;
+ }
+
+ static unsigned int stripe_width()
+ {
+ return 8;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+ StdTransformsFixed<operand_type, result_type, 8, 24, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 24, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 7.16, 1.14, 0.67 };
+ default:
+ return { 12.67, 3.98, 1.16 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_fp16_mla_8x24;
+ cls_a64_interleaved_fp16_mla_8x24(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A55r1:
+ kernel=a64_interleaved_fp16_mla_8x24_a55;
+ break;
+ case CPUModel::X1:
+ kernel=a64_interleaved_fp16_mla_8x24_x1;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
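
This header follows the usual arm_gemm descriptor pattern: the cls_* class publishes the tile geometry (8x24, k_unroll 1, with the output presumably written in 8-lane stripes per stripe_width()) and per-CPU throughput estimates, and its constructor swaps the kernel pointer to the A55 or X1 tuning at runtime. A hedged sketch of consuming the descriptor; the describe helper and its printout are mine, not library API:

    #include <cstdio>
    // CpuInfoT stands in for the library's CPUInfo type.
    template <typename Cls, typename CpuInfoT>
    static void describe(const CpuInfoT *ci)
    {
        Cls strat(ci);  // constructor picks the generic/a55/x1 variant
        std::printf("tile %ux%u, k_unroll=%u, stripe=%u\n",
                    Cls::out_height(), Cls::out_width(),
                    Cls::k_unroll(), Cls::stripe_width());
    }
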
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp
new file mode 100644
index 0000000000..49500f2d18
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/a55.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void a64_interleaved_fp16_mla_8x24_a55(
+ const __fp16 *Apanel, const __fp16 *Bpanel,
+ __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x10, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x9, %x[Apanel]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x27, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x9\n"
+ "cmp x27, #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "prfm pldl1keep, [x28, #0x0]\n"
+ "movi v11.16b, #0x0\n"
+ "prfm pldl1keep, [x28, #0x40]\n"
+ "movi v12.16b, #0x0\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "movi v13.16b, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v14.16b, #0x0\n"
+ "ldr q2, [x28, #0x0]\n"
+ "movi v15.16b, #0x0\n"
+ "ldr q3, [x28, #0x10]\n"
+ "movi v16.16b, #0x0\n"
+ "ldr q4, [x28, #0x20]\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr d1, [%x[Apanel], #0x10]\n"
+ "fmla v8.8h, v2.8h, v0.h[0]\n"
+ "ldr x26, [%x[Apanel], #0x18]\n"
+ "fmla v11.8h, v2.8h, v0.h[1]\n"
+ "ldr d5, [x28, #0x30]\n"
+ "fmla v14.8h, v2.8h, v0.h[2]\n"
+ "ldr x25, [x28, #0x38]\n"
+ "fmla v17.8h, v2.8h, v0.h[3]\n"
+ "ldr d6, [x28, #0x40]\n"
+ "fmla v20.8h, v2.8h, v0.h[4]\n"
+ "ldr x24, [x28, #0x48]\n"
+ "fmla v23.8h, v2.8h, v0.h[5]\n"
+ "ldr d7, [x28, #0x50]\n"
+ "fmla v26.8h, v2.8h, v0.h[6]\n"
+ "ldr x23, [x28, #0x58]\n"
+ "fmla v29.8h, v2.8h, v0.h[7]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v9.8h, v3.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x28, #0x100]\n"
+ "fmla v12.8h, v3.8h, v0.h[1]\n"
+ "prfm pldl1keep, [x28, #0x140]\n"
+ "fmla v15.8h, v3.8h, v0.h[2]\n"
+ "add x28, x28, #0x60\n"
+ "fmla v18.8h, v3.8h, v0.h[3]\n"
+ "ldr d2, [x28, #0x0]\n"
+ "fmla v21.8h, v3.8h, v0.h[4]\n"
+ "ldr x22, [x28, #0x8]\n"
+ "fmla v24.8h, v3.8h, v0.h[5]\n"
+ "ldr x21, [x28, #0x18]\n"
+ "fmla v27.8h, v3.8h, v0.h[6]\n"
+ "ldr x20, [%x[Apanel], #0x8]\n"
+ "fmla v30.8h, v3.8h, v0.h[7]\n"
+ "ldr d3, [x28, #0x10]\n"
+ "fmla v10.8h, v4.8h, v0.h[0]\n"
+ "ldr x19, [x28, #0x28]\n"
+ "fmla v13.8h, v4.8h, v0.h[1]\n"
+ "mov v1.d[1], x26\n"
+ "fmla v16.8h, v4.8h, v0.h[2]\n"
+ "mov v5.d[1], x25\n"
+ "fmla v19.8h, v4.8h, v0.h[3]\n"
+ "mov v6.d[1], x24\n"
+ "fmla v22.8h, v4.8h, v0.h[4]\n"
+ "mov v7.d[1], x23\n"
+ "fmla v25.8h, v4.8h, v0.h[5]\n"
+ "sub x27, x27, #0x2\n"
+ "fmla v28.8h, v4.8h, v0.h[6]\n"
+ "cmp x27, #0x2\n"
+ "fmla v31.8h, v4.8h, v0.h[7]\n"
+ "ldr d0, [%x[Apanel], #0x0]\n"
+ "ldr d4, [x28, #0x20]\n"
+ "mov v2.d[1], x22\n"
+ "mov v3.d[1], x21\n"
+ "fmla v8.8h, v5.8h, v1.h[0]\n"
+ "mov v0.d[1], x20\n"
+ "fmla v11.8h, v5.8h, v1.h[1]\n"
+ "mov v4.d[1], x19\n"
+ "fmla v14.8h, v5.8h, v1.h[2]\n"
+ "fmla v17.8h, v5.8h, v1.h[3]\n"
+ "fmla v20.8h, v5.8h, v1.h[4]\n"
+ "fmla v23.8h, v5.8h, v1.h[5]\n"
+ "fmla v26.8h, v5.8h, v1.h[6]\n"
+ "fmla v29.8h, v5.8h, v1.h[7]\n"
+ "fmla v9.8h, v6.8h, v1.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v15.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v1.h[3]\n"
+ "fmla v21.8h, v6.8h, v1.h[4]\n"
+ "fmla v24.8h, v6.8h, v1.h[5]\n"
+ "fmla v27.8h, v6.8h, v1.h[6]\n"
+ "fmla v30.8h, v6.8h, v1.h[7]\n"
+ "fmla v10.8h, v7.8h, v1.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v16.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v1.h[3]\n"
+ "fmla v22.8h, v7.8h, v1.h[4]\n"
+ "fmla v25.8h, v7.8h, v1.h[5]\n"
+ "fmla v28.8h, v7.8h, v1.h[6]\n"
+ "fmla v31.8h, v7.8h, v1.h[7]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla v8.8h, v2.8h, v0.h[0]\n"
+ "add x28, x28, #0x30\n"
+ "fmla v11.8h, v2.8h, v0.h[1]\n"
+ "fmla v14.8h, v2.8h, v0.h[2]\n"
+ "fmla v17.8h, v2.8h, v0.h[3]\n"
+ "fmla v20.8h, v2.8h, v0.h[4]\n"
+ "fmla v23.8h, v2.8h, v0.h[5]\n"
+ "fmla v26.8h, v2.8h, v0.h[6]\n"
+ "fmla v29.8h, v2.8h, v0.h[7]\n"
+ "fmla v9.8h, v3.8h, v0.h[0]\n"
+ "fmla v12.8h, v3.8h, v0.h[1]\n"
+ "fmla v15.8h, v3.8h, v0.h[2]\n"
+ "fmla v18.8h, v3.8h, v0.h[3]\n"
+ "fmla v21.8h, v3.8h, v0.h[4]\n"
+ "fmla v24.8h, v3.8h, v0.h[5]\n"
+ "fmla v27.8h, v3.8h, v0.h[6]\n"
+ "fmla v30.8h, v3.8h, v0.h[7]\n"
+ "fmla v10.8h, v4.8h, v0.h[0]\n"
+ "fmla v13.8h, v4.8h, v0.h[1]\n"
+ "fmla v16.8h, v4.8h, v0.h[2]\n"
+ "fmla v19.8h, v4.8h, v0.h[3]\n"
+ "fmla v22.8h, v4.8h, v0.h[4]\n"
+ "fmla v25.8h, v4.8h, v0.h[5]\n"
+ "fmla v28.8h, v4.8h, v0.h[6]\n"
+ "fmla v31.8h, v4.8h, v0.h[7]\n"
+ "cbz x27, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "ldr q5, [x28, #0x0]\n"
+ "fmla v8.8h, v5.8h, v0.h[0]\n"
+ "ldr q6, [x28, #0x10]\n"
+ "fmla v11.8h, v5.8h, v0.h[1]\n"
+ "ldr q7, [x28, #0x20]\n"
+ "fmla v14.8h, v5.8h, v0.h[2]\n"
+ "fmla v17.8h, v5.8h, v0.h[3]\n"
+ "add x28, x28, #0x30\n"
+ "fmla v20.8h, v5.8h, v0.h[4]\n"
+ "fmla v23.8h, v5.8h, v0.h[5]\n"
+ "fmla v26.8h, v5.8h, v0.h[6]\n"
+ "fmla v29.8h, v5.8h, v0.h[7]\n"
+ "fmla v9.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v0.h[1]\n"
+ "fmla v15.8h, v6.8h, v0.h[2]\n"
+ "fmla v18.8h, v6.8h, v0.h[3]\n"
+ "fmla v21.8h, v6.8h, v0.h[4]\n"
+ "fmla v24.8h, v6.8h, v0.h[5]\n"
+ "fmla v27.8h, v6.8h, v0.h[6]\n"
+ "fmla v30.8h, v6.8h, v0.h[7]\n"
+ "fmla v10.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v0.h[1]\n"
+ "fmla v16.8h, v7.8h, v0.h[2]\n"
+ "fmla v19.8h, v7.8h, v0.h[3]\n"
+ "fmla v22.8h, v7.8h, v0.h[4]\n"
+ "fmla v25.8h, v7.8h, v0.h[5]\n"
+ "fmla v28.8h, v7.8h, v0.h[6]\n"
+ "fmla v31.8h, v7.8h, v0.h[7]\n"
+ "5:" // multiply loop done
+ "subs x10, x10, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
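
What distinguishes this a55.cpp from the generic variant is the split-load idiom: on the dual-issue in-order Cortex-A55, a 128-bit ldr q cannot pair with the NEON arithmetic pipe (as I understand the scheduling), so the kernel issues a 64-bit vector load, a 64-bit GPR load, and a lane insert, each of which can slot between fmla instructions. A compilable sketch of the idiom, assuming an fp16-capable AArch64 toolchain; the helper name is mine, and the real kernel inlines this with hand-scheduled registers:

    #include <arm_neon.h>
    #include <cstdint>
    // Load 16 bytes as: 64-bit NEON load + 64-bit GPR load + lane insert,
    // matching the "ldr d5 / ldr x25 / mov v5.d[1], x25" pattern above.
    static inline float16x8_t load_q_split(const __fp16 *p)
    {
        float16x8_t v;
        uint64_t hi;
        __asm__("ldr %d[v], [%[p]]\n\t"      // low half into v.d[0]
                "ldr %[hi], [%[p], #8]\n\t"  // high half via the GPR pipe
                "mov %[v].d[1], %[hi]\n\t"   // lane insert, pairs with fmla
                : [v] "=&w"(v), [hi] "=&r"(hi)
                : [p] "r"(p)
                : "memory");
        return v;
    }
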
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp
new file mode 100644
index 0000000000..a9da6956ed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/generic.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void a64_interleaved_fp16_mla_8x24(
+ const __fp16 *Apanel, const __fp16 *Bpanel,
+ __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "ldr q2, [x20, #0x0]\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "ldr q3, [x20, #0x10]\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "ldr q4, [x20, #0x20]\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla v8.8h, v2.8h, v0.h[0]\n"
+ "fmla v11.8h, v2.8h, v0.h[1]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "fmla v14.8h, v2.8h, v0.h[2]\n"
+ "fmla v17.8h, v2.8h, v0.h[3]\n"
+ "ldr q5, [x20, #0x30]\n"
+ "fmla v20.8h, v2.8h, v0.h[4]\n"
+ "fmla v23.8h, v2.8h, v0.h[5]\n"
+ "ldr q6, [x20, #0x40]\n"
+ "fmla v26.8h, v2.8h, v0.h[6]\n"
+ "fmla v29.8h, v2.8h, v0.h[7]\n"
+ "ldr q7, [x20, #0x50]\n"
+ "fmla v9.8h, v3.8h, v0.h[0]\n"
+ "fmla v12.8h, v3.8h, v0.h[1]\n"
+ "sub x19, x19, #0x2\n"
+ "fmla v15.8h, v3.8h, v0.h[2]\n"
+ "fmla v18.8h, v3.8h, v0.h[3]\n"
+ "cmp x19, #0x2\n"
+ "fmla v21.8h, v3.8h, v0.h[4]\n"
+ "fmla v24.8h, v3.8h, v0.h[5]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ "fmla v27.8h, v3.8h, v0.h[6]\n"
+ "fmla v30.8h, v3.8h, v0.h[7]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v10.8h, v4.8h, v0.h[0]\n"
+ "fmla v13.8h, v4.8h, v0.h[1]\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ "fmla v16.8h, v4.8h, v0.h[2]\n"
+ "fmla v19.8h, v4.8h, v0.h[3]\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ "fmla v22.8h, v4.8h, v0.h[4]\n"
+ "fmla v25.8h, v4.8h, v0.h[5]\n"
+ "add x20, x20, #0x60\n"
+ "fmla v28.8h, v4.8h, v0.h[6]\n"
+ "fmla v31.8h, v4.8h, v0.h[7]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "fmla v8.8h, v5.8h, v1.h[0]\n"
+ "fmla v11.8h, v5.8h, v1.h[1]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "fmla v14.8h, v5.8h, v1.h[2]\n"
+ "fmla v17.8h, v5.8h, v1.h[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "fmla v20.8h, v5.8h, v1.h[4]\n"
+ "fmla v23.8h, v5.8h, v1.h[5]\n"
+ "ldr q4, [x20, #0x20]\n"
+ "fmla v26.8h, v5.8h, v1.h[6]\n"
+ "fmla v29.8h, v5.8h, v1.h[7]\n"
+ "fmla v9.8h, v6.8h, v1.h[0]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v15.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v1.h[3]\n"
+ "fmla v21.8h, v6.8h, v1.h[4]\n"
+ "fmla v24.8h, v6.8h, v1.h[5]\n"
+ "fmla v27.8h, v6.8h, v1.h[6]\n"
+ "fmla v30.8h, v6.8h, v1.h[7]\n"
+ "fmla v10.8h, v7.8h, v1.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v16.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v1.h[3]\n"
+ "fmla v22.8h, v7.8h, v1.h[4]\n"
+ "fmla v25.8h, v7.8h, v1.h[5]\n"
+ "fmla v28.8h, v7.8h, v1.h[6]\n"
+ "fmla v31.8h, v7.8h, v1.h[7]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla v8.8h, v2.8h, v0.h[0]\n"
+ "fmla v11.8h, v2.8h, v0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla v14.8h, v2.8h, v0.h[2]\n"
+ "fmla v17.8h, v2.8h, v0.h[3]\n"
+ "add x20, x20, #0x30\n"
+ "fmla v20.8h, v2.8h, v0.h[4]\n"
+ "fmla v23.8h, v2.8h, v0.h[5]\n"
+ "fmla v26.8h, v2.8h, v0.h[6]\n"
+ "fmla v29.8h, v2.8h, v0.h[7]\n"
+ "fmla v9.8h, v3.8h, v0.h[0]\n"
+ "fmla v12.8h, v3.8h, v0.h[1]\n"
+ "fmla v15.8h, v3.8h, v0.h[2]\n"
+ "fmla v18.8h, v3.8h, v0.h[3]\n"
+ "fmla v21.8h, v3.8h, v0.h[4]\n"
+ "fmla v24.8h, v3.8h, v0.h[5]\n"
+ "fmla v27.8h, v3.8h, v0.h[6]\n"
+ "fmla v30.8h, v3.8h, v0.h[7]\n"
+ "fmla v10.8h, v4.8h, v0.h[0]\n"
+ "fmla v13.8h, v4.8h, v0.h[1]\n"
+ "fmla v16.8h, v4.8h, v0.h[2]\n"
+ "fmla v19.8h, v4.8h, v0.h[3]\n"
+ "fmla v22.8h, v4.8h, v0.h[4]\n"
+ "fmla v25.8h, v4.8h, v0.h[5]\n"
+ "fmla v28.8h, v4.8h, v0.h[6]\n"
+ "fmla v31.8h, v4.8h, v0.h[7]\n"
+ "cbz x19, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "fmla v8.8h, v5.8h, v0.h[0]\n"
+ "ldr q6, [x20, #0x10]\n"
+ "ldr q7, [x20, #0x20]\n"
+ "fmla v11.8h, v5.8h, v0.h[1]\n"
+ "fmla v14.8h, v5.8h, v0.h[2]\n"
+ "fmla v17.8h, v5.8h, v0.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla v20.8h, v5.8h, v0.h[4]\n"
+ "fmla v23.8h, v5.8h, v0.h[5]\n"
+ "add x20, x20, #0x30\n"
+ "fmla v26.8h, v5.8h, v0.h[6]\n"
+ "fmla v29.8h, v5.8h, v0.h[7]\n"
+ "fmla v9.8h, v6.8h, v0.h[0]\n"
+ "fmla v12.8h, v6.8h, v0.h[1]\n"
+ "fmla v15.8h, v6.8h, v0.h[2]\n"
+ "fmla v18.8h, v6.8h, v0.h[3]\n"
+ "fmla v21.8h, v6.8h, v0.h[4]\n"
+ "fmla v24.8h, v6.8h, v0.h[5]\n"
+ "fmla v27.8h, v6.8h, v0.h[6]\n"
+ "fmla v30.8h, v6.8h, v0.h[7]\n"
+ "fmla v10.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v0.h[1]\n"
+ "fmla v16.8h, v7.8h, v0.h[2]\n"
+ "fmla v19.8h, v7.8h, v0.h[3]\n"
+ "fmla v22.8h, v7.8h, v0.h[4]\n"
+ "fmla v25.8h, v7.8h, v0.h[5]\n"
+ "fmla v28.8h, v7.8h, v0.h[6]\n"
+ "fmla v31.8h, v7.8h, v0.h[7]\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
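
For reference, here is a scalar model of what one pass over the panels computes, under my reading of the interleaved layout (A carries 8 values per k-step, B carries 24, and accumulation stays in __fp16 exactly as the fmla.8h instructions do). A sketch, not the library's reference path; it builds with an AArch64 compiler that supports __fp16:

    static void tile_8x24_ref(const __fp16 *A, const __fp16 *B,
                              __fp16 *C, int K)
    {
        __fp16 acc[8][24] = {};                // mirrors v8..v31, 3 regs/row
        for (int k = 0; k < K; k++)
            for (int r = 0; r < 8; r++)        // broadcast lane v0.h[r]
                for (int c = 0; c < 24; c++)   // columns from the 3 B vectors
                    acc[r][c] += A[k * 8 + r] * B[k * 24 + c];
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 24; c++)
                C[r * 24 + c] = acc[r][c];     // matches the str q8..q31 order
    }
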
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp
new file mode 100644
index 0000000000..efaedeb33f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp16_mla_8x24/x1.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void a64_interleaved_fp16_mla_8x24_x1(
+ const __fp16 *Apanel, const __fp16 *Bpanel,
+ __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "ldr q1, [x20, #0x0]\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "ldr q2, [x20, #0x10]\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "ldr q3, [x20, #0x20]\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla v8.8h, v1.8h, v0.h[0]\n"
+ "fmla v11.8h, v1.8h, v0.h[1]\n"
+ "sub x19, x19, #0x2\n"
+ "fmla v14.8h, v1.8h, v0.h[2]\n"
+ "fmla v17.8h, v1.8h, v0.h[3]\n"
+ "cmp x19, #0x2\n"
+ "fmla v20.8h, v1.8h, v0.h[4]\n"
+ "fmla v23.8h, v1.8h, v0.h[5]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ "fmla v26.8h, v1.8h, v0.h[6]\n"
+ "fmla v29.8h, v1.8h, v0.h[7]\n"
+ "ldr q1, [x20, #0x30]\n"
+ "fmla v9.8h, v2.8h, v0.h[0]\n"
+ "fmla v12.8h, v2.8h, v0.h[1]\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ "fmla v15.8h, v2.8h, v0.h[2]\n"
+ "fmla v18.8h, v2.8h, v0.h[3]\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ "fmla v21.8h, v2.8h, v0.h[4]\n"
+ "fmla v24.8h, v2.8h, v0.h[5]\n"
+ "fmla v27.8h, v2.8h, v0.h[6]\n"
+ "fmla v30.8h, v2.8h, v0.h[7]\n"
+ "ldr q2, [x20, #0x40]\n"
+ "fmla v10.8h, v3.8h, v0.h[0]\n"
+ "fmla v13.8h, v3.8h, v0.h[1]\n"
+ "fmla v16.8h, v3.8h, v0.h[2]\n"
+ "fmla v19.8h, v3.8h, v0.h[3]\n"
+ "fmla v22.8h, v3.8h, v0.h[4]\n"
+ "fmla v25.8h, v3.8h, v0.h[5]\n"
+ "fmla v28.8h, v3.8h, v0.h[6]\n"
+ "fmla v31.8h, v3.8h, v0.h[7]\n"
+ "ldr q0, [%x[Apanel], #0x10]\n"
+ "ldr q3, [x20, #0x50]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "add x20, x20, #0x60\n"
+ "fmla v8.8h, v1.8h, v0.h[0]\n"
+ "fmla v11.8h, v1.8h, v0.h[1]\n"
+ "fmla v14.8h, v1.8h, v0.h[2]\n"
+ "fmla v17.8h, v1.8h, v0.h[3]\n"
+ "fmla v20.8h, v1.8h, v0.h[4]\n"
+ "fmla v23.8h, v1.8h, v0.h[5]\n"
+ "fmla v26.8h, v1.8h, v0.h[6]\n"
+ "fmla v29.8h, v1.8h, v0.h[7]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v9.8h, v2.8h, v0.h[0]\n"
+ "fmla v12.8h, v2.8h, v0.h[1]\n"
+ "fmla v15.8h, v2.8h, v0.h[2]\n"
+ "fmla v18.8h, v2.8h, v0.h[3]\n"
+ "fmla v21.8h, v2.8h, v0.h[4]\n"
+ "fmla v24.8h, v2.8h, v0.h[5]\n"
+ "fmla v27.8h, v2.8h, v0.h[6]\n"
+ "fmla v30.8h, v2.8h, v0.h[7]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "fmla v10.8h, v3.8h, v0.h[0]\n"
+ "fmla v13.8h, v3.8h, v0.h[1]\n"
+ "fmla v16.8h, v3.8h, v0.h[2]\n"
+ "fmla v19.8h, v3.8h, v0.h[3]\n"
+ "fmla v22.8h, v3.8h, v0.h[4]\n"
+ "fmla v25.8h, v3.8h, v0.h[5]\n"
+ "fmla v28.8h, v3.8h, v0.h[6]\n"
+ "fmla v31.8h, v3.8h, v0.h[7]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q3, [x20, #0x20]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla v8.8h, v1.8h, v0.h[0]\n"
+ "fmla v11.8h, v1.8h, v0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla v14.8h, v1.8h, v0.h[2]\n"
+ "fmla v17.8h, v1.8h, v0.h[3]\n"
+ "add x20, x20, #0x30\n"
+ "fmla v20.8h, v1.8h, v0.h[4]\n"
+ "fmla v23.8h, v1.8h, v0.h[5]\n"
+ "fmla v26.8h, v1.8h, v0.h[6]\n"
+ "fmla v29.8h, v1.8h, v0.h[7]\n"
+ "fmla v9.8h, v2.8h, v0.h[0]\n"
+ "fmla v12.8h, v2.8h, v0.h[1]\n"
+ "fmla v15.8h, v2.8h, v0.h[2]\n"
+ "fmla v18.8h, v2.8h, v0.h[3]\n"
+ "fmla v21.8h, v2.8h, v0.h[4]\n"
+ "fmla v24.8h, v2.8h, v0.h[5]\n"
+ "fmla v27.8h, v2.8h, v0.h[6]\n"
+ "fmla v30.8h, v2.8h, v0.h[7]\n"
+ "fmla v10.8h, v3.8h, v0.h[0]\n"
+ "fmla v13.8h, v3.8h, v0.h[1]\n"
+ "fmla v16.8h, v3.8h, v0.h[2]\n"
+ "fmla v19.8h, v3.8h, v0.h[3]\n"
+ "fmla v22.8h, v3.8h, v0.h[4]\n"
+ "fmla v25.8h, v3.8h, v0.h[5]\n"
+ "fmla v28.8h, v3.8h, v0.h[6]\n"
+ "fmla v31.8h, v3.8h, v0.h[7]\n"
+ "cbz x19, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "fmla v8.8h, v4.8h, v0.h[0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q6, [x20, #0x20]\n"
+ "fmla v11.8h, v4.8h, v0.h[1]\n"
+ "fmla v14.8h, v4.8h, v0.h[2]\n"
+ "fmla v17.8h, v4.8h, v0.h[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla v20.8h, v4.8h, v0.h[4]\n"
+ "fmla v23.8h, v4.8h, v0.h[5]\n"
+ "add x20, x20, #0x30\n"
+ "fmla v26.8h, v4.8h, v0.h[6]\n"
+ "fmla v29.8h, v4.8h, v0.h[7]\n"
+ "fmla v9.8h, v5.8h, v0.h[0]\n"
+ "fmla v12.8h, v5.8h, v0.h[1]\n"
+ "fmla v15.8h, v5.8h, v0.h[2]\n"
+ "fmla v18.8h, v5.8h, v0.h[3]\n"
+ "fmla v21.8h, v5.8h, v0.h[4]\n"
+ "fmla v24.8h, v5.8h, v0.h[5]\n"
+ "fmla v27.8h, v5.8h, v0.h[6]\n"
+ "fmla v30.8h, v5.8h, v0.h[7]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v13.8h, v6.8h, v0.h[1]\n"
+ "fmla v16.8h, v6.8h, v0.h[2]\n"
+ "fmla v19.8h, v6.8h, v0.h[3]\n"
+ "fmla v22.8h, v6.8h, v0.h[4]\n"
+ "fmla v25.8h, v6.8h, v0.h[5]\n"
+ "fmla v28.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
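
The X1 tuning keeps the same dataflow but holds a smaller working set in flight (one A vector and three B vectors, reloaded in place) and drops the A55 half-register loads. The pointer increments in its main loop encode the unroll factor; checking my arithmetic on them, assuming an fp16-capable toolchain:

    // Each main-loop trip covers two k-steps; sizes read off the
    // "add %x[Apanel], ..., #0x20" and "add x20, x20, #0x60" above.
    static_assert(0x20 == 2 * 8 * sizeof(__fp16),  "A panel: 2 k-steps x 8 rows");
    static_assert(0x60 == 2 * 24 * sizeof(__fp16), "B panel: 2 k-steps x 24 cols");
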
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp
new file mode 100644
index 0000000000..465a5b4e0f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12.hpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const float *, const float *, \
+ float *, int, int, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_interleaved_fp32_mla_8x12( ARGLIST );
+void a64_interleaved_fp32_mla_8x12_a55( ARGLIST );
+void a64_interleaved_fp32_mla_8x12_x1( ARGLIST );
+
+class cls_a64_interleaved_fp32_mla_8x12
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 12;
+ }
+
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 3.954, 1.252, 1.141 };
+ default:
+ return { 7.2307, 3.876, 2.932 };
+ case CPUModel::A73:
+ return { 2.885, 1.429, 1.163 };
+ case CPUModel::A53:
+ return { 2.7, 0.9, 0.8 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_fp32_mla_8x12;
+ cls_a64_interleaved_fp32_mla_8x12(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A55r1:
+ case CPUModel::A53:
+ kernel=a64_interleaved_fp32_mla_8x12_a55;
+ break;
+ case CPUModel::X1:
+ kernel=a64_interleaved_fp32_mla_8x12_x1;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
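
The three-float initializers are the throughput estimates that feed the improved selection heuristics this commit advertises. Assuming the fields are inner-kernel MACs per cycle plus prepare/merge bytes per cycle (my labels, not names taken from this patch), a plausible shape of the comparison is:

    // Rough sketch of a cycle estimator these numbers could feed; the real
    // heuristic lives elsewhere in arm_gemm and may weight terms differently.
    struct Params { float macs_cycle, prep_bytes_cycle, merge_bytes_cycle; };
    static float estimate_cycles(Params p, float M, float N, float K)
    {
        float mac_cycles   = (M * N * K) / p.macs_cycle;
        float prep_cycles  = (K * N * 4) / p.prep_bytes_cycle;   // fp32 B panel
        float merge_cycles = (M * N * 4) / p.merge_bytes_cycle;  // fp32 C writes
        return mac_cycles + prep_cycles + merge_cycles;          // lower wins
    }
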
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp
new file mode 100644
index 0000000000..46d9ff73b9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/a55.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void a64_interleaved_fp32_mla_8x12_a55(
+ const float *Apanel, const float *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const float *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x28, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x27, %x[Apanel]\n"
+ "ldr x26, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x25, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x27\n"
+ "cmp x25, #0x4\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "movi v11.16b, #0x0\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "movi v12.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v13.16b, #0x0\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "movi v14.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ "movi v15.16b, #0x0\n"
+ "prfm pldl1keep, [x26, #0xc0]\n"
+ "movi v16.16b, #0x0\n"
+ "prfm pldl1keep, [x26, #0x100]\n"
+ "movi v17.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0xc0]\n"
+ "movi v18.16b, #0x0\n"
+ "prfm pldl1keep, [x26, #0x140]\n"
+ "movi v19.16b, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v20.16b, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v21.16b, #0x0\n"
+ "ldr q4, [x26, #0x0]\n"
+ "movi v22.16b, #0x0\n"
+ "ldr q5, [x26, #0x10]\n"
+ "movi v23.16b, #0x0\n"
+ "ldr q6, [x26, #0x20]\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr d2, [%x[Apanel], #0x20]\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "ldr x21, [%x[Apanel], #0x28]\n"
+ "fmla v11.4s, v4.4s, v0.s[1]\n"
+ "ldr d3, [%x[Apanel], #0x30]\n"
+ "fmla v14.4s, v4.4s, v0.s[2]\n"
+ "ldr x20, [%x[Apanel], #0x38]\n"
+ "fmla v17.4s, v4.4s, v0.s[3]\n"
+ "ldr d7, [x26, #0x30]\n"
+ "fmla v20.4s, v4.4s, v1.s[0]\n"
+ "ldr x24, [x26, #0x38]\n"
+ "fmla v23.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "ldr x23, [x26, #0x48]\n"
+ "fmla v29.4s, v4.4s, v1.s[3]\n"
+ "ldr d4, [x26, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "mov v2.d[1], x21\n"
+ "fmla v12.4s, v5.4s, v0.s[1]\n"
+ "mov v3.d[1], x20\n"
+ "fmla v15.4s, v5.4s, v0.s[2]\n"
+ "mov v7.d[1], x24\n"
+ "fmla v18.4s, v5.4s, v0.s[3]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v21.4s, v5.4s, v1.s[0]\n"
+ "ldr x22, [x26, #0x58]\n"
+ "fmla v24.4s, v5.4s, v1.s[1]\n"
+ "ldr x21, [%x[Apanel], #0x48]\n"
+ "fmla v27.4s, v5.4s, v1.s[2]\n"
+ "ldr x20, [%x[Apanel], #0x58]\n"
+ "fmla v30.4s, v5.4s, v1.s[3]\n"
+ "ldr d5, [x26, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr x19, [x26, #0x68]\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "ldr x24, [x26, #0x78]\n"
+ "fmla v16.4s, v6.4s, v0.s[2]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v19.4s, v6.4s, v0.s[3]\n"
+ "ldr d0, [%x[Apanel], #0x40]\n"
+ "fmla v22.4s, v6.4s, v1.s[0]\n"
+ "mov v0.d[1], x21\n"
+ "fmla v25.4s, v6.4s, v1.s[1]\n"
+ "ldr x23, [x26, #0x88]\n"
+ "fmla v28.4s, v6.4s, v1.s[2]\n"
+ "ldr x21, [%x[Apanel], #0x68]\n"
+ "fmla v31.4s, v6.4s, v1.s[3]\n"
+ "ldr d1, [%x[Apanel], #0x50]\n"
+ "ldr d6, [x26, #0x60]\n"
+ "fmla v8.4s, v7.4s, v2.s[0]\n"
+ "fmla v11.4s, v7.4s, v2.s[1]\n"
+ "mov v1.d[1], x20\n"
+ "fmla v14.4s, v7.4s, v2.s[2]\n"
+ "mov v6.d[1], x19\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "ldr x20, [%x[Apanel], #0x78]\n"
+ "fmla v20.4s, v7.4s, v3.s[0]\n"
+ "ldr x22, [x26, #0x98]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr x19, [x26, #0xa8]\n"
+ "fmla v26.4s, v7.4s, v3.s[2]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x100]\n"
+ "fmla v29.4s, v7.4s, v3.s[3]\n"
+ "ldr d7, [x26, #0x70]\n"
+ "fmla v9.4s, v4.4s, v2.s[0]\n"
+ "mov v7.d[1], x24\n"
+ "fmla v12.4s, v4.4s, v2.s[1]\n"
+ "ldr x24, [x26, #0xb8]\n"
+ "fmla v15.4s, v4.4s, v2.s[2]\n"
+ "prfm pldl1keep, [x26, #0x180]\n"
+ "fmla v18.4s, v4.4s, v2.s[3]\n"
+ "prfm pldl1keep, [x26, #0x1c0]\n"
+ "fmla v21.4s, v4.4s, v3.s[0]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x140]\n"
+ "fmla v24.4s, v4.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x26, #0x200]\n"
+ "fmla v27.4s, v4.4s, v3.s[2]\n"
+ "sub x25, x25, #0x4\n"
+ "fmla v30.4s, v4.4s, v3.s[3]\n"
+ "ldr d4, [x26, #0x80]\n"
+ "fmla v10.4s, v5.4s, v2.s[0]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v13.4s, v5.4s, v2.s[1]\n"
+ "cmp x25, #0x4\n"
+ "fmla v16.4s, v5.4s, v2.s[2]\n"
+ "fmla v19.4s, v5.4s, v2.s[3]\n"
+ "ldr d2, [%x[Apanel], #0x60]\n"
+ "fmla v22.4s, v5.4s, v3.s[0]\n"
+ "mov v2.d[1], x21\n"
+ "fmla v25.4s, v5.4s, v3.s[1]\n"
+ "fmla v28.4s, v5.4s, v3.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[3]\n"
+ "ldr d3, [%x[Apanel], #0x70]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr d5, [x26, #0x90]\n"
+ "fmla v11.4s, v6.4s, v0.s[1]\n"
+ "mov v3.d[1], x20\n"
+ "fmla v14.4s, v6.4s, v0.s[2]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v17.4s, v6.4s, v0.s[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "fmla v20.4s, v6.4s, v1.s[0]\n"
+ "ldr x21, [%x[Apanel], #0x8]\n"
+ "fmla v23.4s, v6.4s, v1.s[1]\n"
+ "ldr x20, [%x[Apanel], #0x18]\n"
+ "fmla v26.4s, v6.4s, v1.s[2]\n"
+ "fmla v29.4s, v6.4s, v1.s[3]\n"
+ "ldr d6, [x26, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "mov v6.d[1], x19\n"
+ "fmla v12.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v0.s[2]\n"
+ "fmla v18.4s, v7.4s, v0.s[3]\n"
+ "fmla v21.4s, v7.4s, v1.s[0]\n"
+ "fmla v24.4s, v7.4s, v1.s[1]\n"
+ "fmla v27.4s, v7.4s, v1.s[2]\n"
+ "fmla v30.4s, v7.4s, v1.s[3]\n"
+ "ldr d7, [x26, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "add x26, x26, #0xc0\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "ldr x23, [x26, #0x8]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "ldr x22, [x26, #0x18]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "ldr d0, [%x[Apanel], #0x0]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "ldr x19, [x26, #0x28]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "mov v7.d[1], x24\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "mov v0.d[1], x21\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+ "ldr d1, [%x[Apanel], #0x10]\n"
+ "fmla v8.4s, v5.4s, v2.s[0]\n"
+ "ldr d4, [x26, #0x0]\n"
+ "fmla v11.4s, v5.4s, v2.s[1]\n"
+ "mov v1.d[1], x20\n"
+ "fmla v14.4s, v5.4s, v2.s[2]\n"
+ "mov v4.d[1], x23\n"
+ "fmla v17.4s, v5.4s, v2.s[3]\n"
+ "fmla v20.4s, v5.4s, v3.s[0]\n"
+ "fmla v23.4s, v5.4s, v3.s[1]\n"
+ "fmla v26.4s, v5.4s, v3.s[2]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "ldr d5, [x26, #0x10]\n"
+ "fmla v9.4s, v6.4s, v2.s[0]\n"
+ "mov v5.d[1], x22\n"
+ "fmla v12.4s, v6.4s, v2.s[1]\n"
+ "fmla v15.4s, v6.4s, v2.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v21.4s, v6.4s, v3.s[0]\n"
+ "fmla v24.4s, v6.4s, v3.s[1]\n"
+ "fmla v27.4s, v6.4s, v3.s[2]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "ldr d6, [x26, #0x20]\n"
+ "mov v6.d[1], x19\n"
+ "fmla v10.4s, v7.4s, v2.s[0]\n"
+ "fmla v13.4s, v7.4s, v2.s[1]\n"
+ "fmla v16.4s, v7.4s, v2.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v22.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v3.s[1]\n"
+ "fmla v28.4s, v7.4s, v3.s[2]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "add x26, x26, #0x30\n"
+ "fmla v11.4s, v4.4s, v0.s[1]\n"
+ "fmla v14.4s, v4.4s, v0.s[2]\n"
+ "fmla v17.4s, v4.4s, v0.s[3]\n"
+ "fmla v20.4s, v4.4s, v1.s[0]\n"
+ "fmla v23.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v29.4s, v4.4s, v1.s[3]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v5.4s, v0.s[1]\n"
+ "fmla v15.4s, v5.4s, v0.s[2]\n"
+ "fmla v18.4s, v5.4s, v0.s[3]\n"
+ "fmla v21.4s, v5.4s, v1.s[0]\n"
+ "fmla v24.4s, v5.4s, v1.s[1]\n"
+ "fmla v27.4s, v5.4s, v1.s[2]\n"
+ "fmla v30.4s, v5.4s, v1.s[3]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "fmla v16.4s, v6.4s, v0.s[2]\n"
+ "fmla v19.4s, v6.4s, v0.s[3]\n"
+ "fmla v22.4s, v6.4s, v1.s[0]\n"
+ "fmla v25.4s, v6.4s, v1.s[1]\n"
+ "fmla v28.4s, v6.4s, v1.s[2]\n"
+ "fmla v31.4s, v6.4s, v1.s[3]\n"
+ "cbz x25, 6f\n"
+ "5:" // odd loop
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q7, [x26, #0x0]\n"
+ "fmla v8.4s, v7.4s, v0.s[0]\n"
+ "ldr q4, [x26, #0x10]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "ldr q5, [x26, #0x20]\n"
+ "fmla v14.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v0.s[3]\n"
+ "add x26, x26, #0x30\n"
+ "fmla v20.4s, v7.4s, v1.s[0]\n"
+ "fmla v23.4s, v7.4s, v1.s[1]\n"
+ "fmla v26.4s, v7.4s, v1.s[2]\n"
+ "fmla v29.4s, v7.4s, v1.s[3]\n"
+ "fmla v9.4s, v4.4s, v0.s[0]\n"
+ "fmla v12.4s, v4.4s, v0.s[1]\n"
+ "fmla v15.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v0.s[3]\n"
+ "fmla v21.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v1.s[1]\n"
+ "fmla v27.4s, v4.4s, v1.s[2]\n"
+ "fmla v30.4s, v4.4s, v1.s[3]\n"
+ "fmla v10.4s, v5.4s, v0.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[1]\n"
+ "fmla v16.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v0.s[3]\n"
+ "fmla v22.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v1.s[1]\n"
+ "fmla v28.4s, v5.4s, v1.s[2]\n"
+ "fmla v31.4s, v5.4s, v1.s[3]\n"
+ "bne 5b\n"
+ "6:" // multiply loop done
+ "subs x28, x28, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
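
The K bookkeeping follows the same pattern as the fp16 kernels, but with a deeper unroll: ka.K stores K - 1 because one k-step is always peeled into the block at label 4, the main loop (label 3) retires four k-steps per trip here (two in the fp16 paths), and the odd loop (label 5) retires one at a time. A small model to check the accounting; a sketch of my reading, not library code:

    static void split_k(int K, int &main_trips, int &odd_trips)
    {
        int k = K - 1;          // ka.K: one k-step is peeled unconditionally
        main_trips = k / 4;     // label 3: four k-steps per trip
        odd_trips  = k % 4;     // label 5: one k-step per trip
        // invariant: main_trips * 4 + odd_trips + 1 == K
    }
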
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp
new file mode 100644
index 0000000000..06dc1534c1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/generic.cpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void a64_interleaved_fp32_mla_8x12(
+ const float *Apanel, const float *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const float *Bpanel = {};
+ } ka;
+
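+    // Note: K is counted in units of the kernel's k_unroll (1 for fp32 MLA).
+    // One unit is subtracted because the first operands are loaded ahead of
+    // the main loop and consumed by the tail at label 4 ("main loop skip").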
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x4\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0xc0]\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0xc0]\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q6, [x20, #0x20]\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v11.4s, v4.4s, v0.s[1]\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "fmla v14.4s, v4.4s, v0.s[2]\n"
+ "fmla v17.4s, v4.4s, v0.s[3]\n"
+ "ldr q3, [%x[Apanel], #0x30]\n"
+ "fmla v20.4s, v4.4s, v1.s[0]\n"
+ "fmla v23.4s, v4.4s, v1.s[1]\n"
+ "ldr q7, [x20, #0x30]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v29.4s, v4.4s, v1.s[3]\n"
+ "ldr q4, [x20, #0x40]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v5.4s, v0.s[1]\n"
+ "sub x19, x19, #0x4\n"
+ "fmla v15.4s, v5.4s, v0.s[2]\n"
+ "fmla v18.4s, v5.4s, v0.s[3]\n"
+ "cmp x19, #0x4\n"
+ "fmla v21.4s, v5.4s, v1.s[0]\n"
+ "fmla v24.4s, v5.4s, v1.s[1]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x100]\n"
+ "fmla v27.4s, v5.4s, v1.s[2]\n"
+ "fmla v30.4s, v5.4s, v1.s[3]\n"
+ "ldr q5, [x20, #0x50]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x20, #0x180]\n"
+ "fmla v16.4s, v6.4s, v0.s[2]\n"
+ "fmla v19.4s, v6.4s, v0.s[3]\n"
+ "ldr q0, [%x[Apanel], #0x40]\n"
+ "fmla v22.4s, v6.4s, v1.s[0]\n"
+ "fmla v25.4s, v6.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x20, #0x1c0]\n"
+ "fmla v28.4s, v6.4s, v1.s[2]\n"
+ "fmla v31.4s, v6.4s, v1.s[3]\n"
+ "ldr q1, [%x[Apanel], #0x50]\n"
+ "fmla v8.4s, v7.4s, v2.s[0]\n"
+ "fmla v11.4s, v7.4s, v2.s[1]\n"
+ "ldr q6, [x20, #0x60]\n"
+ "fmla v14.4s, v7.4s, v2.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x140]\n"
+ "fmla v20.4s, v7.4s, v3.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x20, #0x200]\n"
+ "fmla v26.4s, v7.4s, v3.s[2]\n"
+ "fmla v29.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x20, #0x70]\n"
+ "fmla v9.4s, v4.4s, v2.s[0]\n"
+ "fmla v12.4s, v4.4s, v2.s[1]\n"
+ "fmla v15.4s, v4.4s, v2.s[2]\n"
+ "fmla v18.4s, v4.4s, v2.s[3]\n"
+ "fmla v21.4s, v4.4s, v3.s[0]\n"
+ "fmla v24.4s, v4.4s, v3.s[1]\n"
+ "fmla v27.4s, v4.4s, v3.s[2]\n"
+ "fmla v30.4s, v4.4s, v3.s[3]\n"
+ "ldr q4, [x20, #0x80]\n"
+ "fmla v10.4s, v5.4s, v2.s[0]\n"
+ "fmla v13.4s, v5.4s, v2.s[1]\n"
+ "fmla v16.4s, v5.4s, v2.s[2]\n"
+ "fmla v19.4s, v5.4s, v2.s[3]\n"
+ "ldr q2, [%x[Apanel], #0x60]\n"
+ "fmla v22.4s, v5.4s, v3.s[0]\n"
+ "fmla v25.4s, v5.4s, v3.s[1]\n"
+ "fmla v28.4s, v5.4s, v3.s[2]\n"
+ "fmla v31.4s, v5.4s, v3.s[3]\n"
+ "ldr q3, [%x[Apanel], #0x70]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "fmla v11.4s, v6.4s, v0.s[1]\n"
+ "ldr q5, [x20, #0x90]\n"
+ "fmla v14.4s, v6.4s, v0.s[2]\n"
+ "fmla v17.4s, v6.4s, v0.s[3]\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "fmla v20.4s, v6.4s, v1.s[0]\n"
+ "fmla v23.4s, v6.4s, v1.s[1]\n"
+ "fmla v26.4s, v6.4s, v1.s[2]\n"
+ "fmla v29.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x20, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v12.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v0.s[2]\n"
+ "fmla v18.4s, v7.4s, v0.s[3]\n"
+ "fmla v21.4s, v7.4s, v1.s[0]\n"
+ "fmla v24.4s, v7.4s, v1.s[1]\n"
+ "fmla v27.4s, v7.4s, v1.s[2]\n"
+ "fmla v30.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x20, #0xb0]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "add x20, x20, #0xc0\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "fmla v8.4s, v5.4s, v2.s[0]\n"
+ "fmla v11.4s, v5.4s, v2.s[1]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "fmla v14.4s, v5.4s, v2.s[2]\n"
+ "fmla v17.4s, v5.4s, v2.s[3]\n"
+ "fmla v20.4s, v5.4s, v3.s[0]\n"
+ "fmla v23.4s, v5.4s, v3.s[1]\n"
+ "fmla v26.4s, v5.4s, v3.s[2]\n"
+ "fmla v29.4s, v5.4s, v3.s[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "fmla v9.4s, v6.4s, v2.s[0]\n"
+ "fmla v12.4s, v6.4s, v2.s[1]\n"
+ "fmla v15.4s, v6.4s, v2.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v21.4s, v6.4s, v3.s[0]\n"
+ "fmla v24.4s, v6.4s, v3.s[1]\n"
+ "fmla v27.4s, v6.4s, v3.s[2]\n"
+ "fmla v30.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x20, #0x20]\n"
+ "fmla v10.4s, v7.4s, v2.s[0]\n"
+ "fmla v13.4s, v7.4s, v2.s[1]\n"
+ "fmla v16.4s, v7.4s, v2.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v22.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v3.s[1]\n"
+ "fmla v28.4s, v7.4s, v3.s[2]\n"
+ "fmla v31.4s, v7.4s, v3.s[3]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v8.4s, v4.4s, v0.s[0]\n"
+ "fmla v11.4s, v4.4s, v0.s[1]\n"
+ "add x20, x20, #0x30\n"
+ "fmla v14.4s, v4.4s, v0.s[2]\n"
+ "fmla v17.4s, v4.4s, v0.s[3]\n"
+ "fmla v20.4s, v4.4s, v1.s[0]\n"
+ "fmla v23.4s, v4.4s, v1.s[1]\n"
+ "fmla v26.4s, v4.4s, v1.s[2]\n"
+ "fmla v29.4s, v4.4s, v1.s[3]\n"
+ "fmla v9.4s, v5.4s, v0.s[0]\n"
+ "fmla v12.4s, v5.4s, v0.s[1]\n"
+ "fmla v15.4s, v5.4s, v0.s[2]\n"
+ "fmla v18.4s, v5.4s, v0.s[3]\n"
+ "fmla v21.4s, v5.4s, v1.s[0]\n"
+ "fmla v24.4s, v5.4s, v1.s[1]\n"
+ "fmla v27.4s, v5.4s, v1.s[2]\n"
+ "fmla v30.4s, v5.4s, v1.s[3]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v13.4s, v6.4s, v0.s[1]\n"
+ "fmla v16.4s, v6.4s, v0.s[2]\n"
+ "fmla v19.4s, v6.4s, v0.s[3]\n"
+ "fmla v22.4s, v6.4s, v1.s[0]\n"
+ "fmla v25.4s, v6.4s, v1.s[1]\n"
+ "fmla v28.4s, v6.4s, v1.s[2]\n"
+ "fmla v31.4s, v6.4s, v1.s[3]\n"
+ "cbz x19, 6f\n"
+ "5:" // odd loop
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "subs x19, x19, #0x1\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q4, [x20, #0x10]\n"
+ "fmla v8.4s, v7.4s, v0.s[0]\n"
+ "ldr q5, [x20, #0x20]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v14.4s, v7.4s, v0.s[2]\n"
+ "fmla v17.4s, v7.4s, v0.s[3]\n"
+ "fmla v20.4s, v7.4s, v1.s[0]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v23.4s, v7.4s, v1.s[1]\n"
+ "fmla v26.4s, v7.4s, v1.s[2]\n"
+ "add x20, x20, #0x30\n"
+ "fmla v29.4s, v7.4s, v1.s[3]\n"
+ "fmla v9.4s, v4.4s, v0.s[0]\n"
+ "fmla v12.4s, v4.4s, v0.s[1]\n"
+ "fmla v15.4s, v4.4s, v0.s[2]\n"
+ "fmla v18.4s, v4.4s, v0.s[3]\n"
+ "fmla v21.4s, v4.4s, v1.s[0]\n"
+ "fmla v24.4s, v4.4s, v1.s[1]\n"
+ "fmla v27.4s, v4.4s, v1.s[2]\n"
+ "fmla v30.4s, v4.4s, v1.s[3]\n"
+ "fmla v10.4s, v5.4s, v0.s[0]\n"
+ "fmla v13.4s, v5.4s, v0.s[1]\n"
+ "fmla v16.4s, v5.4s, v0.s[2]\n"
+ "fmla v19.4s, v5.4s, v0.s[3]\n"
+ "fmla v22.4s, v5.4s, v1.s[0]\n"
+ "fmla v25.4s, v5.4s, v1.s[1]\n"
+ "fmla v28.4s, v5.4s, v1.s[2]\n"
+ "fmla v31.4s, v5.4s, v1.s[3]\n"
+ "bne 5b\n"
+ "6:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp
new file mode 100644
index 0000000000..8ba36cb87d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_fp32_mla_8x12/x1.cpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void a64_interleaved_fp32_mla_8x12_x1(
+ const float *Apanel, const float *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const float *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+
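+    // Cortex-X1 variant: the B panel is kept in v2-v4 and reloaded in place
+    // at each step, where the generic version rotates through v4-v7.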
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x4\n"
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0xc0]\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0xc0]\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr q4, [x20, #0x20]\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v11.4s, v2.4s, v0.s[1]\n"
+ "sub x19, x19, #0x4\n"
+ "fmla v14.4s, v2.4s, v0.s[2]\n"
+ "fmla v17.4s, v2.4s, v0.s[3]\n"
+ "cmp x19, #0x4\n"
+ "fmla v20.4s, v2.4s, v1.s[0]\n"
+ "fmla v23.4s, v2.4s, v1.s[1]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x100]\n"
+ "fmla v26.4s, v2.4s, v1.s[2]\n"
+ "fmla v29.4s, v2.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x30]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v12.4s, v3.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x20, #0x180]\n"
+ "fmla v15.4s, v3.4s, v0.s[2]\n"
+ "fmla v18.4s, v3.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x1c0]\n"
+ "fmla v21.4s, v3.4s, v1.s[0]\n"
+ "fmla v24.4s, v3.4s, v1.s[1]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x140]\n"
+ "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n"
+ "ldr q3, [x20, #0x40]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x20, #0x200]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "ldr q0, [%x[Apanel], #0x20]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+ "ldr q1, [%x[Apanel], #0x30]\n"
+ "ldr q4, [x20, #0x50]\n"
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v11.4s, v2.4s, v0.s[1]\n"
+ "fmla v14.4s, v2.4s, v0.s[2]\n"
+ "fmla v17.4s, v2.4s, v0.s[3]\n"
+ "fmla v20.4s, v2.4s, v1.s[0]\n"
+ "fmla v23.4s, v2.4s, v1.s[1]\n"
+ "fmla v26.4s, v2.4s, v1.s[2]\n"
+ "fmla v29.4s, v2.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v12.4s, v3.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[2]\n"
+ "fmla v18.4s, v3.4s, v0.s[3]\n"
+ "fmla v21.4s, v3.4s, v1.s[0]\n"
+ "fmla v24.4s, v3.4s, v1.s[1]\n"
+ "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n"
+ "ldr q3, [x20, #0x70]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "ldr q0, [%x[Apanel], #0x40]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+ "ldr q1, [%x[Apanel], #0x50]\n"
+ "ldr q4, [x20, #0x80]\n"
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v11.4s, v2.4s, v0.s[1]\n"
+ "fmla v14.4s, v2.4s, v0.s[2]\n"
+ "fmla v17.4s, v2.4s, v0.s[3]\n"
+ "fmla v20.4s, v2.4s, v1.s[0]\n"
+ "fmla v23.4s, v2.4s, v1.s[1]\n"
+ "fmla v26.4s, v2.4s, v1.s[2]\n"
+ "fmla v29.4s, v2.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x90]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v12.4s, v3.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[2]\n"
+ "fmla v18.4s, v3.4s, v0.s[3]\n"
+ "fmla v21.4s, v3.4s, v1.s[0]\n"
+ "fmla v24.4s, v3.4s, v1.s[1]\n"
+ "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n"
+ "ldr q3, [x20, #0xa0]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "ldr q0, [%x[Apanel], #0x60]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+ "ldr q1, [%x[Apanel], #0x70]\n"
+ "ldr q4, [x20, #0xb0]\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "add x20, x20, #0xc0\n"
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v11.4s, v2.4s, v0.s[1]\n"
+ "fmla v14.4s, v2.4s, v0.s[2]\n"
+ "fmla v17.4s, v2.4s, v0.s[3]\n"
+ "fmla v20.4s, v2.4s, v1.s[0]\n"
+ "fmla v23.4s, v2.4s, v1.s[1]\n"
+ "fmla v26.4s, v2.4s, v1.s[2]\n"
+ "fmla v29.4s, v2.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v12.4s, v3.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[2]\n"
+ "fmla v18.4s, v3.4s, v0.s[3]\n"
+ "fmla v21.4s, v3.4s, v1.s[0]\n"
+ "fmla v24.4s, v3.4s, v1.s[1]\n"
+ "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [x20, #0x20]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v8.4s, v2.4s, v0.s[0]\n"
+ "fmla v11.4s, v2.4s, v0.s[1]\n"
+ "add x20, x20, #0x30\n"
+ "fmla v14.4s, v2.4s, v0.s[2]\n"
+ "fmla v17.4s, v2.4s, v0.s[3]\n"
+ "fmla v20.4s, v2.4s, v1.s[0]\n"
+ "fmla v23.4s, v2.4s, v1.s[1]\n"
+ "fmla v26.4s, v2.4s, v1.s[2]\n"
+ "fmla v29.4s, v2.4s, v1.s[3]\n"
+ "fmla v9.4s, v3.4s, v0.s[0]\n"
+ "fmla v12.4s, v3.4s, v0.s[1]\n"
+ "fmla v15.4s, v3.4s, v0.s[2]\n"
+ "fmla v18.4s, v3.4s, v0.s[3]\n"
+ "fmla v21.4s, v3.4s, v1.s[0]\n"
+ "fmla v24.4s, v3.4s, v1.s[1]\n"
+ "fmla v27.4s, v3.4s, v1.s[2]\n"
+ "fmla v30.4s, v3.4s, v1.s[3]\n"
+ "fmla v10.4s, v4.4s, v0.s[0]\n"
+ "fmla v13.4s, v4.4s, v0.s[1]\n"
+ "fmla v16.4s, v4.4s, v0.s[2]\n"
+ "fmla v19.4s, v4.4s, v0.s[3]\n"
+ "fmla v22.4s, v4.4s, v1.s[0]\n"
+ "fmla v25.4s, v4.4s, v1.s[1]\n"
+ "fmla v28.4s, v4.4s, v1.s[2]\n"
+ "fmla v31.4s, v4.4s, v1.s[3]\n"
+ "cbz x19, 6f\n"
+ "5:" // odd loop
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "subs x19, x19, #0x1\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x20, #0x10]\n"
+ "fmla v8.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [x20, #0x20]\n"
+ "fmla v11.4s, v5.4s, v0.s[1]\n"
+ "fmla v14.4s, v5.4s, v0.s[2]\n"
+ "fmla v17.4s, v5.4s, v0.s[3]\n"
+ "fmla v20.4s, v5.4s, v1.s[0]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla v23.4s, v5.4s, v1.s[1]\n"
+ "fmla v26.4s, v5.4s, v1.s[2]\n"
+ "add x20, x20, #0x30\n"
+ "fmla v29.4s, v5.4s, v1.s[3]\n"
+ "fmla v9.4s, v6.4s, v0.s[0]\n"
+ "fmla v12.4s, v6.4s, v0.s[1]\n"
+ "fmla v15.4s, v6.4s, v0.s[2]\n"
+ "fmla v18.4s, v6.4s, v0.s[3]\n"
+ "fmla v21.4s, v6.4s, v1.s[0]\n"
+ "fmla v24.4s, v6.4s, v1.s[1]\n"
+ "fmla v27.4s, v6.4s, v1.s[2]\n"
+ "fmla v30.4s, v6.4s, v1.s[3]\n"
+ "fmla v10.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v0.s[1]\n"
+ "fmla v16.4s, v7.4s, v0.s[2]\n"
+ "fmla v19.4s, v7.4s, v0.s[3]\n"
+ "fmla v22.4s, v7.4s, v1.s[0]\n"
+ "fmla v25.4s, v7.4s, v1.s[1]\n"
+ "fmla v28.4s, v7.4s, v1.s[2]\n"
+ "fmla v31.4s, v7.4s, v1.s[3]\n"
+ "bne 5b\n"
+ "6:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp
new file mode 100644
index 0000000000..bc6b9931e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+#include <cstdint>      // int8_t / int32_t used below
+#include <type_traits>  // std::is_same in get_performance_parameters()
+
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int32_t *, int, int, int
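+// Shared kernel signature: (A panel, B panel, C panel, ablocks, bblocks, K);
+// reused by the declarations and the kern_type typedef below.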
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_interleaved_s8s32_dot_8x12( ARGLIST );
+void a64_interleaved_s8s32_dot_8x12_a55( ARGLIST );
+void a64_interleaved_s8s32_dot_8x12_x1( ARGLIST );
+
+class cls_a64_interleaved_s8s32_dot_8x12
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 12;
+ }
+
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
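+
+    // Rough per-CPU-model throughput estimates ({MACs/cycle, prepare, merge})
+    // consumed by the library's kernel-selection heuristics.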
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 15.361, 0.9341, 0.1636 };
+ default:
+ return { 29.0698, 3.9793, 0.4003 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_s8s32_dot_8x12;
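+
+    // The constructor swaps in a micro-architecture-specific variant where
+    // one exists; every other CPU model keeps the generic kernel set above.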
+ cls_a64_interleaved_s8s32_dot_8x12(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A55r1:
+ kernel=a64_interleaved_s8s32_dot_8x12_a55;
+ break;
+ case CPUModel::X1:
+ kernel=a64_interleaved_s8s32_dot_8x12_x1;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp
new file mode 100644
index 0000000000..3acd61c88c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/a55.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_s8s32_dot_8x12_a55(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
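+    // Cortex-A55 variant: 128-bit loads are split into 64-bit "ldr d" plus a
+    // GPR "ldr x", then recombined with "mov v.d[1]", a pattern intended to
+    // dual-issue on the in-order A55 pipeline.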
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x27, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x26, %x[Apanel]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x24, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x26\n"
+ "cmp x24, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "movi v11.4s, #0x0\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "movi v12.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v13.4s, #0x0\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "movi v14.4s, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v16.4s, #0x0\n"
+ "ldr q4, [x25, #0x0]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr q5, [x25, #0x10]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr q6, [x25, #0x20]\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n"
+ "ldr d2, [%x[Apanel], #0x20]\n"
+ "ldr x23, [%x[Apanel], #0x28]\n"
+ ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n"
+ "ldr d3, [%x[Apanel], #0x30]\n"
+ ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n"
+ "ldr x19, [%x[Apanel], #0x38]\n"
+ ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr x22, [x25, #0x38]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ "ldr x20, [x25, #0x48]\n"
+ ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n"
+ "ldr x21, [x25, #0x58]\n"
+ ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n"
+ "ldr d4, [x25, #0x30]\n"
+ ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n"
+ "mov v2.d[1], x23\n"
+ ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n"
+ "mov v3.d[1], x19\n"
+ ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n"
+ "mov v4.d[1], x22\n"
+ ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n"
+ "prfm pldl1keep, [x25, #0x100]\n"
+ ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n"
+ "prfm pldl1keep, [x25, #0x140]\n"
+ ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n"
+ "ldr d5, [x25, #0x40]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "mov v5.d[1], x20\n"
+ ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n"
+ "ldr x20, [%x[Apanel], #0x8]\n"
+ ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n"
+ "ldr x19, [%x[Apanel], #0x18]\n"
+ ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n"
+ "ldr d0, [%x[Apanel], #0x0]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "sub x24, x24, #0x2\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "cmp x24, #0x2\n"
+ ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n"
+ "mov v0.d[1], x20\n"
+ ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n"
+ "ldr d6, [x25, #0x50]\n"
+ "mov v6.d[1], x21\n"
+ "add x25, x25, #0x60\n"
+ ".inst 0x4f82e088 // sdot v8.4s, v4.16b, v2.4b[0]\n"
+ "ldr d1, [%x[Apanel], #0x10]\n"
+ ".inst 0x4fa2e08b // sdot v11.4s, v4.16b, v2.4b[1]\n"
+ "ldr x22, [x25, #0x8]\n"
+ ".inst 0x4f82e88e // sdot v14.4s, v4.16b, v2.4b[2]\n"
+ "ldr x20, [x25, #0x18]\n"
+ ".inst 0x4fa2e891 // sdot v17.4s, v4.16b, v2.4b[3]\n"
+ "ldr x21, [x25, #0x28]\n"
+ ".inst 0x4f83e094 // sdot v20.4s, v4.16b, v3.4b[0]\n"
+ "mov v1.d[1], x19\n"
+ ".inst 0x4fa3e097 // sdot v23.4s, v4.16b, v3.4b[1]\n"
+ ".inst 0x4f83e89a // sdot v26.4s, v4.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e89d // sdot v29.4s, v4.16b, v3.4b[3]\n"
+ "ldr d4, [x25, #0x0]\n"
+ ".inst 0x4f82e0a9 // sdot v9.4s, v5.16b, v2.4b[0]\n"
+ "mov v4.d[1], x22\n"
+ ".inst 0x4fa2e0ac // sdot v12.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8af // sdot v15.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8b2 // sdot v18.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4f83e0b5 // sdot v21.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0b8 // sdot v24.4s, v5.16b, v3.4b[1]\n"
+ ".inst 0x4f83e8bb // sdot v27.4s, v5.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8be // sdot v30.4s, v5.16b, v3.4b[3]\n"
+ "ldr d5, [x25, #0x10]\n"
+ ".inst 0x4f82e0ca // sdot v10.4s, v6.16b, v2.4b[0]\n"
+ "mov v5.d[1], x20\n"
+ ".inst 0x4fa2e0cd // sdot v13.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8d3 // sdot v19.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0d9 // sdot v25.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4f83e8dc // sdot v28.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr d6, [x25, #0x20]\n"
+ "mov v6.d[1], x21\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n"
+ "add x25, x25, #0x30\n"
+ ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n"
+ "cbz x24, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q7, [x25, #0x0]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x25, #0x10]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q5, [x25, #0x20]\n"
+ ".inst 0x4f80e8ee // sdot v14.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8f1 // sdot v17.4s, v7.16b, v0.4b[3]\n"
+ "add x25, x25, #0x30\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0f7 // sdot v23.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8fa // sdot v26.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8fd // sdot v29.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e08c // sdot v12.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4f80e88f // sdot v15.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e098 // sdot v24.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4f81e89b // sdot v27.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e89e // sdot v30.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4f80e0aa // sdot v10.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0ad // sdot v13.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0b9 // sdot v25.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8bc // sdot v28.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8bf // sdot v31.4s, v5.16b, v1.4b[3]\n"
+ "5:" // multiply loop done
+ "subs x27, x27, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp
new file mode 100644
index 0000000000..267f62ae8a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/generic.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_s8s32_dot_8x12(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
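+    // Each sdot consumes a block of four int8 values per lane, hence K is
+    // divided by the k_unroll of 4; as in the fp32 kernels, one block is
+    // peeled into the tail at label 4.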
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q4, [x20, #0x0]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr q5, [x20, #0x10]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "ldr q6, [x20, #0x20]\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n"
+ "ldr q3, [%x[Apanel], #0x30]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n"
+ "ldr q4, [x20, #0x30]\n"
+ ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n"
+ "ldr q5, [x20, #0x40]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x20, #0x50]\n"
+ "add x20, x20, #0x60\n"
+ ".inst 0x4f82e088 // sdot v8.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e08b // sdot v11.4s, v4.16b, v2.4b[1]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ ".inst 0x4f82e88e // sdot v14.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e891 // sdot v17.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4f83e094 // sdot v20.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e097 // sdot v23.4s, v4.16b, v3.4b[1]\n"
+ ".inst 0x4f83e89a // sdot v26.4s, v4.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e89d // sdot v29.4s, v4.16b, v3.4b[3]\n"
+ "ldr q4, [x20, #0x0]\n"
+ ".inst 0x4f82e0a9 // sdot v9.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0ac // sdot v12.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8af // sdot v15.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8b2 // sdot v18.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4f83e0b5 // sdot v21.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0b8 // sdot v24.4s, v5.16b, v3.4b[1]\n"
+ ".inst 0x4f83e8bb // sdot v27.4s, v5.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8be // sdot v30.4s, v5.16b, v3.4b[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ ".inst 0x4f82e0ca // sdot v10.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0cd // sdot v13.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8d3 // sdot v19.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0d9 // sdot v25.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4f83e8dc // sdot v28.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x20, #0x20]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x4f80e088 // sdot v8.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e08b // sdot v11.4s, v4.16b, v0.4b[1]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e891 // sdot v17.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e89d // sdot v29.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4f80e0a9 // sdot v9.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0ac // sdot v12.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8af // sdot v15.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8b2 // sdot v18.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0b8 // sdot v24.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8bb // sdot v27.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8be // sdot v30.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0cd // sdot v13.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8d0 // sdot v16.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8d3 // sdot v19.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8dc // sdot v28.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8df // sdot v31.4s, v6.16b, v1.4b[3]\n"
+ "cbz x19, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q4, [x20, #0x10]\n"
+ ".inst 0x4f80e0e8 // sdot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x20, #0x20]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8ee // sdot v14.4s, v7.16b, v0.4b[2]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x4fa0e8f1 // sdot v17.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0f4 // sdot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0f7 // sdot v23.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8fa // sdot v26.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8fd // sdot v29.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e08c // sdot v12.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4f80e88f // sdot v15.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4f81e095 // sdot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e098 // sdot v24.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4f81e89b // sdot v27.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e89e // sdot v30.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4f80e0aa // sdot v10.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0ad // sdot v13.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0b6 // sdot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0b9 // sdot v25.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8bc // sdot v28.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8bf // sdot v31.4s, v5.16b, v1.4b[3]\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp
new file mode 100644
index 0000000000..4804c059a3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_dot_8x12/x1.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_s8s32_dot_8x12_x1(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q2, [x20, #0x0]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr q3, [x20, #0x10]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "ldr q4, [x20, #0x20]\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n"
+ "ldr q2, [x20, #0x30]\n"
+ ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n"
+ ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n"
+ "ldr q3, [x20, #0x40]\n"
+ ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n"
+ "ldr q0, [%x[Apanel], #0x20]\n"
+ ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n"
+ "ldr q1, [%x[Apanel], #0x30]\n"
+ "ldr q4, [x20, #0x50]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "add x20, x20, #0x60\n"
+ ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n"
+ ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n"
+ ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n"
+ ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n"
+ "ldr q2, [x20, #0x0]\n"
+ ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n"
+ ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n"
+ ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n"
+ ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [x20, #0x20]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x4f80e048 // sdot v8.4s, v2.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e04b // sdot v11.4s, v2.16b, v0.4b[1]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x4f80e84e // sdot v14.4s, v2.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e851 // sdot v17.4s, v2.16b, v0.4b[3]\n"
+ ".inst 0x4f81e054 // sdot v20.4s, v2.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e057 // sdot v23.4s, v2.16b, v1.4b[1]\n"
+ ".inst 0x4f81e85a // sdot v26.4s, v2.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e85d // sdot v29.4s, v2.16b, v1.4b[3]\n"
+ ".inst 0x4f80e069 // sdot v9.4s, v3.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e06c // sdot v12.4s, v3.16b, v0.4b[1]\n"
+ ".inst 0x4f80e86f // sdot v15.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e872 // sdot v18.4s, v3.16b, v0.4b[3]\n"
+ ".inst 0x4f81e075 // sdot v21.4s, v3.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e078 // sdot v24.4s, v3.16b, v1.4b[1]\n"
+ ".inst 0x4f81e87b // sdot v27.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e87e // sdot v30.4s, v3.16b, v1.4b[3]\n"
+ ".inst 0x4f80e08a // sdot v10.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4f80e890 // sdot v16.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4f81e096 // sdot v22.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4f81e89c // sdot v28.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e89f // sdot v31.4s, v4.16b, v1.4b[3]\n"
+ "cbz x19, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x20, #0x10]\n"
+ ".inst 0x4f80e0a8 // sdot v8.4s, v5.16b, v0.4b[0]\n"
+ "ldr q7, [x20, #0x20]\n"
+ ".inst 0x4fa0e0ab // sdot v11.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8ae // sdot v14.4s, v5.16b, v0.4b[2]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0b7 // sdot v23.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8ba // sdot v26.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8bd // sdot v29.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4f80e0c9 // sdot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0cc // sdot v12.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8cf // sdot v15.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d8 // sdot v24.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8db // sdot v27.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8de // sdot v30.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4f80e0ea // sdot v10.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0ed // sdot v13.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0f9 // sdot v25.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4f81e8fc // sdot v28.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8ff // sdot v31.4s, v7.16b, v1.4b[3]\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
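The raw `.inst` words in this kernel are Armv8.2 dot-product instructions, emitted as fixed encodings so the file assembles even without `+dotprod` assembler support. As a reference for reading the loop above, here is a minimal scalar model of the by-element form `sdot vd.4s, vn.16b, vm.4b[idx]` (illustrative sketch, not part of the patch):

    #include <cstdint>

    // Scalar model of "sdot vd.4s, vn.16b, vm.4b[idx]": each 32-bit lane l of
    // vd accumulates a 4-way int8 dot product of vn's l-th byte quadruple
    // against the idx-th byte quadruple of vm.
    void sdot_by_element(int32_t vd[4], const int8_t vn[16], const int8_t vm[16], int idx) {
        for (int l = 0; l < 4; l++) {
            int32_t acc = 0;
            for (int b = 0; b < 4; b++) {
                acc += int32_t(vn[4 * l + b]) * int32_t(vm[4 * idx + b]);
            }
            vd[l] += acc;
        }
    }

With the two A registers holding eight rows of four int8 K-values and three B registers holding twelve columns, each `sdot` produces four columns for one row, so the 24 accumulators v8..v31 form the full 8x12 int32 tile written out by the stores above.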
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index b17b76f170..ff69bc8f53 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,64 +10,103 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
-
-#include <cstdint>
#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void a64_interleaved_s8s32_mmla_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_interleaved_s8s32_mmla_8x12( ARGLIST );
-class cls_a64_interleaved_s8s32_mmla_8x12 {
+class cls_a64_interleaved_s8s32_mmla_8x12
+{
public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return 12;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return 4;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 8;
}
- // Use the standard fixed size transforms.
+
StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.57, 4.08, 8.01 };
+ case CPUModel::A510:
+ return { 48.25, 3.53, 3.71 };
+ case CPUModel::V1:
+ return { 117.02, 4.98, 10.87 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.53, 3.70, 0.50 };
+ case CPUModel::A510:
+ return { 48.22, 2.49, 0.29 };
+ case CPUModel::V1:
+ return { 116.76, 4.67, 0.60 };
+ }
+ }
+
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // __aarch64__
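The `get_performance_parameters()` hook added here is how the improved selection heuristics from the commit message get their inputs: each kernel reports per-CPU-model cost figures and the framework compares estimated costs across candidate kernels. A rough sketch of how such a triple could be combined into a cycle estimate; the field meanings (MACs per cycle, plus prepare-side and merge-side bytes per cycle) are an assumption based on how these figures are used elsewhere in arm_gemm, not something this hunk states:

    #include <cstddef>

    // Hypothetical cost model for the triples returned above; field meanings
    // are assumed, and the real arm_gemm estimator may differ.
    struct Params {
        double macs_per_cycle;
        double prepare_bytes_per_cycle;
        double merge_bytes_per_cycle;
    };

    double estimate_cycles(const Params &p, double M, double N, double K, double elem_size) {
        double mac_cycles     = (M * N * K) / p.macs_per_cycle;
        double prepare_cycles = p.prepare_bytes_per_cycle > 0 ? (K * (M + N) * elem_size) / p.prepare_bytes_per_cycle : 0.0;
        double merge_cycles   = p.merge_bytes_per_cycle > 0 ? (M * N * elem_size) / p.merge_bytes_per_cycle : 0.0;
        return mac_cycles + prepare_cycles + merge_cycles;
    }

The single-value `return { 1.0 };` fallback then simply marks types for which the kernel has no tuned figures.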
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index 2093e75b8e..0c2722a1c2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,373 +23,340 @@
*/
#ifdef __aarch64__
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void a64_interleaved_s8s32_mmla_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
+void a64_interleaved_s8s32_mmla_8x12(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
- K /= 8;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
+ __asm__ __volatile__(
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q4, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- "movi v14.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "movi v15.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- "movi v24.4s, #0\n"
- "movi v25.4s, #0\n"
- "movi v26.4s, #0\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
- "movi v29.4s, #0\n"
- "movi v30.4s, #0\n"
- "movi v31.4s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x4e84a40c // smmla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a432 // smmla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a458 // smmla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "add %[b_ptr], %[b_ptr], #0xc0\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47b // smmla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a432 // smmla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x4e84a40c // smmla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a432 // smmla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a458 // smmla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- "add %[b_ptr], %[b_ptr], #0xe0\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x80]\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47b // smmla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x70]\n"
- ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x60]\n"
- ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x50]\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a432 // smmla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x40]\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x30]\n"
- ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x20]\n"
- ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- ".inst 0x4e84a40c // smmla v12.4s, v0.16b, v4.16b\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- ".inst 0x4e84a432 // smmla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a458 // smmla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "str q0, [%[c_ptr]]\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x70]\n"
- ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x4e84a40c // smmla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a432 // smmla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a458 // smmla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47e // smmla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x4e87a409 // smmla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x4e87a42f // smmla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x4e87a455 // smmla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x4e87a47b // smmla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x4e86a40c // smmla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x4e86a432 // smmla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x4e86a458 // smmla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x4e86a47e // smmla v30.4s, v3.16b, v6.16b\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "str q0, [%[c_ptr]]\n"
- ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- "4:\n"
- "uzp2 v2.2d, v16.2d, v17.2d\n"
- "str q3, [%[c_ptr], #0x30]\n"
- "uzp2 v3.2d, v18.2d, v19.2d\n"
- "str q4, [%[c_ptr], #0x40]\n"
- "uzp1 v4.2d, v20.2d, v21.2d\n"
- "str q5, [%[c_ptr], #0x50]\n"
- "uzp1 v5.2d, v22.2d, v23.2d\n"
- "str q6, [%[c_ptr], #0x60]\n"
- "uzp1 v6.2d, v24.2d, v25.2d\n"
- "str q7, [%[c_ptr], #0x70]\n"
- "uzp2 v7.2d, v20.2d, v21.2d\n"
- "str q0, [%[c_ptr], #0x80]\n"
- "uzp2 v0.2d, v22.2d, v23.2d\n"
- "str q1, [%[c_ptr], #0x90]\n"
- "uzp2 v1.2d, v24.2d, v25.2d\n"
- "str q2, [%[c_ptr], #0xa0]\n"
- "uzp1 v2.2d, v26.2d, v27.2d\n"
- "str q3, [%[c_ptr], #0xb0]\n"
- "uzp1 v3.2d, v28.2d, v29.2d\n"
- "str q4, [%[c_ptr], #0xc0]\n"
- "uzp1 v4.2d, v30.2d, v31.2d\n"
- "str q5, [%[c_ptr], #0xd0]\n"
- "uzp2 v5.2d, v26.2d, v27.2d\n"
- "str q6, [%[c_ptr], #0xe0]\n"
- "uzp2 v6.2d, v28.2d, v29.2d\n"
- "str q7, [%[c_ptr], #0xf0]\n"
- "uzp2 v7.2d, v30.2d, v31.2d\n"
- "str q0, [%[c_ptr], #0x100]\n"
- "str q1, [%[c_ptr], #0x110]\n"
- "str q2, [%[c_ptr], #0x120]\n"
- "str q3, [%[c_ptr], #0x130]\n"
- "str q4, [%[c_ptr], #0x140]\n"
- "str q5, [%[c_ptr], #0x150]\n"
- "str q6, [%[c_ptr], #0x160]\n"
- "str q7, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x4\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ldr q4, [x20, #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q5, [x20, #0x10]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "add x20, x20, #0x20\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ "ldr q6, [x20, #0x0]\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldr q4, [x20, #0x20]\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x20, #0x40]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ "sub x19, x19, #0x4\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ "ldr q0, [%x[Apanel], #0x10]\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ "ldr q1, [%x[Apanel], #0x20]\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x20, #0x50]\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ldr q2, [%x[Apanel], #0x30]\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "ldr q3, [%x[Apanel], #0x40]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q4, [x20, #0x60]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ "ldr q5, [x20, #0x70]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ "cmp x19, #0x4\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldr q6, [x20, #0x80]\n"
+ "ldr q7, [x20, #0x90]\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x20, #0xa0]\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ "ldr q0, [%x[Apanel], #0x50]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ "ldr q1, [%x[Apanel], #0x60]\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ "ldr q5, [x20, #0xb0]\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q2, [%x[Apanel], #0x70]\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "ldr q3, [%x[Apanel], #0x80]\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ "ldr q6, [x20, #0xc0]\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ "ldr q7, [x20, #0xd0]\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldr q4, [x20, #0xe0]\n"
+ "ldr q5, [x20, #0xf0]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x20, #0x100]\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ "ldr q0, [%x[Apanel], #0x90]\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ "ldr q1, [%x[Apanel], #0xa0]\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x20, #0x110]\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ "ldr q2, [%x[Apanel], #0xb0]\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "ldr q3, [%x[Apanel], #0xc0]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q4, [x20, #0x120]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ "ldr q5, [x20, #0x130]\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldr q6, [x20, #0x140]\n"
+ "ldr q7, [x20, #0x150]\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x20, #0x160]\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ "ldr q0, [%x[Apanel], #0xd0]\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ "ldr q1, [%x[Apanel], #0xe0]\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ "ldr q5, [x20, #0x170]\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ "ldr q2, [%x[Apanel], #0xf0]\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x100\n"
+ "add x20, x20, #0x180\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a40b // smmla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e85a431 // smmla v17.4s, v1.16b, v5.16b\n"
+ "ldr q6, [x20, #0x0]\n"
+ ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a457 // smmla v23.4s, v2.16b, v5.16b\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47d // smmla v29.4s, v3.16b, v5.16b\n"
+ "ldr q4, [x20, #0x20]\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x4e86a409 // smmla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x4e86a42f // smmla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e86a455 // smmla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e86a47b // smmla v27.4s, v3.16b, v6.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x4e87a40c // smmla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e84a40a // smmla v10.4s, v0.16b, v4.16b\n"
+ "add x20, x20, #0x40\n"
+ ".inst 0x4e85a40d // smmla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e87a432 // smmla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e84a430 // smmla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e85a433 // smmla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e87a458 // smmla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e87a47e // smmla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x4e84a456 // smmla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e85a459 // smmla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e84a47c // smmla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x4e85a47f // smmla v31.4s, v3.16b, v5.16b\n"
+ "cbz x19, 6f\n"
+ "5:" // odd loop
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x4e86a408 // smmla v8.4s, v0.16b, v6.16b\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x4e86a42e // smmla v14.4s, v1.16b, v6.16b\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr q3, [%x[Apanel], #0x30]\n"
+ ".inst 0x4e87a40b // smmla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e87a431 // smmla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e86a454 // smmla v20.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x20, #0x20]\n"
+ ".inst 0x4e87a457 // smmla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47a // smmla v26.4s, v3.16b, v6.16b\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x4e87a47d // smmla v29.4s, v3.16b, v7.16b\n"
+ "ldr q6, [x20, #0x40]\n"
+ "ldr q7, [x20, #0x50]\n"
+ "subs x19, x19, #0x1\n"
+ ".inst 0x4e84a409 // smmla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x4e84a42f // smmla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x4e84a455 // smmla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x4e84a47b // smmla v27.4s, v3.16b, v4.16b\n"
+ "add x20, x20, #0x60\n"
+ ".inst 0x4e85a40c // smmla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x4e86a40a // smmla v10.4s, v0.16b, v6.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x4e87a40d // smmla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x4e85a432 // smmla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x4e86a430 // smmla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x4e87a433 // smmla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x4e85a458 // smmla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x4e85a47e // smmla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x4e86a456 // smmla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x4e86a47c // smmla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
+ "bne 5b\n"
+ "6:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
}
} // namespace arm_gemm
-
#endif // __aarch64__
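This rewrite is built around SMMLA, again emitted as raw `.inst` encodings. Each instruction accumulates a 2x2 int32 tile from two 2x8 int8 operands, which is why both the old and new epilogues need the `uzp1`/`uzp2` pairs: adjacent tiles must be de-interleaved back into contiguous output rows before storing. A scalar model of one `smmla vd.4s, vn.16b, vm.16b` (illustrative only):

    #include <cstdint>

    // Scalar model of "smmla vd.4s, vn.16b, vm.16b": vd, viewed as a row-major
    // 2x2 int32 matrix, accumulates A * B^T, where A is vn viewed as a 2x8
    // int8 matrix and B is vm viewed as a 2x8 int8 matrix.
    void smmla(int32_t vd[4], const int8_t vn[16], const int8_t vm[16]) {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                int32_t acc = 0;
                for (int k = 0; k < 8; k++) {
                    acc += int32_t(vn[8 * i + k]) * int32_t(vm[8 * j + k]);
                }
                vd[2 * i + j] += acc;
            }
        }
    }

Because each accumulator holds such a 2x2 tile, `uzp1 v4.2d, v8.2d, v11.2d` gathers the even 64-bit lanes (the top rows of two neighbouring tiles) while the matching `uzp2` gathers the odd lanes (the bottom rows), yielding the plain 8x12 row layout the `str q` sequence expects.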
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp
new file mode 100644
index 0000000000..000cc680da
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint32_t *, int, int, int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_interleaved_u8u32_dot_8x12( ARGLIST );
+void a64_interleaved_u8u32_dot_8x12_a55( ARGLIST );
+void a64_interleaved_u8u32_dot_8x12_x1( ARGLIST );
+
+class cls_a64_interleaved_u8u32_dot_8x12
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 12;
+ }
+
+ static unsigned int stripe_width()
+ {
+ return 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 15.361, 0.9341, 0.1636 };
+ default:
+ return { 29.0698, 3.9793, 0.4003 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_u8u32_dot_8x12;
+ cls_a64_interleaved_u8u32_dot_8x12(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A55r1:
+ kernel=a64_interleaved_u8u32_dot_8x12_a55;
+ break;
+ case CPUModel::X1:
+ kernel=a64_interleaved_u8u32_dot_8x12_x1;
+ break;
+ }
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // __aarch64__
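As elsewhere in this patch, the strategy class resolves its function pointer once, at construction, from the detected CPU model, so the inner GEMM loops pay no per-call dispatch cost. A usage sketch; in practice the templated GEMM strategies instantiate this class internally, and `cpu_info` is assumed here to be a populated `CPUInfo` for the current core:

    // Illustrative only: the framework normally does this itself.
    cls_a64_interleaved_u8u32_dot_8x12 strat(cpu_info);

    // 'kernel' now points at the generic, A55 or X1 variant as appropriate.
    strat.kernel(Apanel, Bpanel, Cpanel, ablocks, bblocks, K);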
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp
new file mode 100644
index 0000000000..7892306153
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/a55.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_u8u32_dot_8x12_a55(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x27, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x26, %x[Apanel]\n"
+ "ldr x25, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x24, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x26\n"
+ "cmp x24, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "movi v11.4s, #0x0\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "movi v12.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v13.4s, #0x0\n"
+ "prfm pldl1keep, [x25, #0x80]\n"
+ "movi v14.4s, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v16.4s, #0x0\n"
+ "ldr q4, [x25, #0x0]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr q5, [x25, #0x10]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr q6, [x25, #0x20]\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n"
+ "ldr d2, [%x[Apanel], #0x20]\n"
+ "ldr x23, [%x[Apanel], #0x28]\n"
+ ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n"
+ "ldr d3, [%x[Apanel], #0x30]\n"
+ ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n"
+ "ldr x19, [%x[Apanel], #0x38]\n"
+ ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr x22, [x25, #0x38]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ "ldr x20, [x25, #0x48]\n"
+ ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n"
+ "ldr x21, [x25, #0x58]\n"
+ ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n"
+ "ldr d4, [x25, #0x30]\n"
+ ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n"
+ "mov v2.d[1], x23\n"
+ ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n"
+ "mov v3.d[1], x19\n"
+ ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n"
+ "mov v4.d[1], x22\n"
+ ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n"
+ "prfm pldl1keep, [x25, #0x100]\n"
+ ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n"
+ "prfm pldl1keep, [x25, #0x140]\n"
+ ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n"
+ "ldr d5, [x25, #0x40]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "mov v5.d[1], x20\n"
+ ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n"
+ "ldr x20, [%x[Apanel], #0x8]\n"
+ ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n"
+ "ldr x19, [%x[Apanel], #0x18]\n"
+ ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n"
+ "ldr d0, [%x[Apanel], #0x0]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "sub x24, x24, #0x2\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "cmp x24, #0x2\n"
+ ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n"
+ "mov v0.d[1], x20\n"
+ ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n"
+ "ldr d6, [x25, #0x50]\n"
+ "mov v6.d[1], x21\n"
+ "add x25, x25, #0x60\n"
+ ".inst 0x6f82e088 // udot v8.4s, v4.16b, v2.4b[0]\n"
+ "ldr d1, [%x[Apanel], #0x10]\n"
+ ".inst 0x6fa2e08b // udot v11.4s, v4.16b, v2.4b[1]\n"
+ "ldr x22, [x25, #0x8]\n"
+ ".inst 0x6f82e88e // udot v14.4s, v4.16b, v2.4b[2]\n"
+ "ldr x20, [x25, #0x18]\n"
+ ".inst 0x6fa2e891 // udot v17.4s, v4.16b, v2.4b[3]\n"
+ "ldr x21, [x25, #0x28]\n"
+ ".inst 0x6f83e094 // udot v20.4s, v4.16b, v3.4b[0]\n"
+ "mov v1.d[1], x19\n"
+ ".inst 0x6fa3e097 // udot v23.4s, v4.16b, v3.4b[1]\n"
+ ".inst 0x6f83e89a // udot v26.4s, v4.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e89d // udot v29.4s, v4.16b, v3.4b[3]\n"
+ "ldr d4, [x25, #0x0]\n"
+ ".inst 0x6f82e0a9 // udot v9.4s, v5.16b, v2.4b[0]\n"
+ "mov v4.d[1], x22\n"
+ ".inst 0x6fa2e0ac // udot v12.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8af // udot v15.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8b2 // udot v18.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6f83e0b5 // udot v21.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0b8 // udot v24.4s, v5.16b, v3.4b[1]\n"
+ ".inst 0x6f83e8bb // udot v27.4s, v5.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8be // udot v30.4s, v5.16b, v3.4b[3]\n"
+ "ldr d5, [x25, #0x10]\n"
+ ".inst 0x6f82e0ca // udot v10.4s, v6.16b, v2.4b[0]\n"
+ "mov v5.d[1], x20\n"
+ ".inst 0x6fa2e0cd // udot v13.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8d3 // udot v19.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0d9 // udot v25.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6f83e8dc // udot v28.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr d6, [x25, #0x20]\n"
+ "mov v6.d[1], x21\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n"
+ "add x25, x25, #0x30\n"
+ ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n"
+ "cbz x24, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q7, [x25, #0x0]\n"
+ ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x25, #0x10]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q5, [x25, #0x20]\n"
+ ".inst 0x6f80e8ee // udot v14.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8f1 // udot v17.4s, v7.16b, v0.4b[3]\n"
+ "add x25, x25, #0x30\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0f7 // udot v23.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8fa // udot v26.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8fd // udot v29.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6f80e089 // udot v9.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e08c // udot v12.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6f80e88f // udot v15.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e098 // udot v24.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6f81e89b // udot v27.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e89e // udot v30.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6f80e0aa // udot v10.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0ad // udot v13.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0b9 // udot v25.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8bc // udot v28.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8bf // udot v31.4s, v5.16b, v1.4b[3]\n"
+ "5:" // multiply loop done
+ "subs x27, x27, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
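Relative to the generic variant that follows, this A55 specialisation splits each 128-bit operand load into a 64-bit `ldr d` plus an integer `ldr x` that is merged back with `mov v.d[1], x`, and interleaves explicit `prfm` prefetches with the dot products. On Cortex-A55's in-order core the narrow vector load and the integer load can issue alongside the NEON arithmetic, hiding load latency that a plain `ldr q` would expose. The recurring pattern, excerpted from the main loop above with comments added:

    "ldr d4, [x25, #0x30]\n"   // low 64 bits of the next B block
    "ldr x22, [x25, #0x38]\n"  // high 64 bits via the integer pipe
    // ... dot products on other registers issue in between ...
    "mov v4.d[1], x22\n"       // merge into the full 128-bit register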
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp
new file mode 100644
index 0000000000..42226e90f5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/generic.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_u8u32_dot_8x12(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q4, [x20, #0x0]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr q5, [x20, #0x10]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "ldr q6, [x20, #0x20]\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n"
+ "ldr q3, [%x[Apanel], #0x30]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n"
+ "ldr q4, [x20, #0x30]\n"
+ ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n"
+ "ldr q5, [x20, #0x40]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x20, #0x50]\n"
+ "add x20, x20, #0x60\n"
+ ".inst 0x6f82e088 // udot v8.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e08b // udot v11.4s, v4.16b, v2.4b[1]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ ".inst 0x6f82e88e // udot v14.4s, v4.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e891 // udot v17.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6f83e094 // udot v20.4s, v4.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e097 // udot v23.4s, v4.16b, v3.4b[1]\n"
+ ".inst 0x6f83e89a // udot v26.4s, v4.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e89d // udot v29.4s, v4.16b, v3.4b[3]\n"
+ "ldr q4, [x20, #0x0]\n"
+ ".inst 0x6f82e0a9 // udot v9.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0ac // udot v12.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8af // udot v15.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8b2 // udot v18.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6f83e0b5 // udot v21.4s, v5.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0b8 // udot v24.4s, v5.16b, v3.4b[1]\n"
+ ".inst 0x6f83e8bb // udot v27.4s, v5.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8be // udot v30.4s, v5.16b, v3.4b[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ ".inst 0x6f82e0ca // udot v10.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0cd // udot v13.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8d3 // udot v19.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0d9 // udot v25.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6f83e8dc // udot v28.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x20, #0x20]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x6f80e088 // udot v8.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e08b // udot v11.4s, v4.16b, v0.4b[1]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x6f80e88e // udot v14.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e891 // udot v17.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6f81e89a // udot v26.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e89d // udot v29.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6f80e0a9 // udot v9.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0ac // udot v12.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8af // udot v15.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8b2 // udot v18.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0b8 // udot v24.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8bb // udot v27.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8be // udot v30.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0cd // udot v13.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8d0 // udot v16.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8d3 // udot v19.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8dc // udot v28.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8df // udot v31.4s, v6.16b, v1.4b[3]\n"
+ "cbz x19, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q4, [x20, #0x10]\n"
+ ".inst 0x6f80e0e8 // udot v8.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x20, #0x20]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8ee // udot v14.4s, v7.16b, v0.4b[2]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x6fa0e8f1 // udot v17.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0f4 // udot v20.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0f7 // udot v23.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8fa // udot v26.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8fd // udot v29.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6f80e089 // udot v9.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e08c // udot v12.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6f80e88f // udot v15.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6f81e095 // udot v21.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e098 // udot v24.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6f81e89b // udot v27.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e89e // udot v30.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6f80e0aa // udot v10.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0ad // udot v13.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0b6 // udot v22.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0b9 // udot v25.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8bc // udot v28.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8bf // udot v31.4s, v5.16b, v1.4b[3]\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
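The dot-product instructions above are emitted as raw .inst encodings (presumably so the file assembles even when the assembler lacks the dot-product extension), with the intended mnemonic given in each comment. As an illustrative scalar model, not part of the kernel, "udot vd.4s, vn.16b, vm.4b[idx]" adds to each 32-bit lane of vd the dot product of the corresponding four bytes of vn with the four bytes of vm selected by idx:

    // Hedged scalar model of UDOT (vector, by element); vm_group stands for
    // the 4-byte element of vm selected by idx.
    #include <cstdint>
    static void udot_by_element(uint32_t vd[4], const uint8_t vn[16], const uint8_t vm_group[4]) {
        for (int lane = 0; lane < 4; lane++) {
            uint32_t sum = 0;
            for (int k = 0; k < 4; k++)
                sum += uint32_t(vn[4 * lane + k]) * uint32_t(vm_group[k]);
            vd[lane] += sum; // accumulates on top of the existing lane value
        }
    }

In this kernel v4-v6 carry B (four columns of four K values per register) and the by-element operand selects one A row's K block, so each udot advances four columns of one output row by four K steps.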
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp
new file mode 100644
index 0000000000..652f2bffc5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_dot_8x12/x1.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_interleaved_u8u32_dot_8x12_x1(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "prfm pldl1keep, [%x[Apanel], #0x40]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q2, [x20, #0x0]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr q3, [x20, #0x10]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "ldr q4, [x20, #0x20]\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n"
+ "prfm pldl1keep, [%x[Apanel], #0x80]\n"
+ ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n"
+ "ldr q2, [x20, #0x30]\n"
+ ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n"
+ "prfm pldl1keep, [x20, #0x100]\n"
+ ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n"
+ "prfm pldl1keep, [x20, #0x140]\n"
+ ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n"
+ ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n"
+ "ldr q3, [x20, #0x40]\n"
+ ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n"
+ "ldr q0, [%x[Apanel], #0x20]\n"
+ ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n"
+ "ldr q1, [%x[Apanel], #0x30]\n"
+ "ldr q4, [x20, #0x50]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "add x20, x20, #0x60\n"
+ ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n"
+ ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n"
+ ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n"
+ ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n"
+ "ldr q2, [x20, #0x0]\n"
+ ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n"
+ ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n"
+ ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n"
+ ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q4, [x20, #0x20]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x6f80e048 // udot v8.4s, v2.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e04b // udot v11.4s, v2.16b, v0.4b[1]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x6f80e84e // udot v14.4s, v2.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e851 // udot v17.4s, v2.16b, v0.4b[3]\n"
+ ".inst 0x6f81e054 // udot v20.4s, v2.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e057 // udot v23.4s, v2.16b, v1.4b[1]\n"
+ ".inst 0x6f81e85a // udot v26.4s, v2.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e85d // udot v29.4s, v2.16b, v1.4b[3]\n"
+ ".inst 0x6f80e069 // udot v9.4s, v3.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e06c // udot v12.4s, v3.16b, v0.4b[1]\n"
+ ".inst 0x6f80e86f // udot v15.4s, v3.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e872 // udot v18.4s, v3.16b, v0.4b[3]\n"
+ ".inst 0x6f81e075 // udot v21.4s, v3.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e078 // udot v24.4s, v3.16b, v1.4b[1]\n"
+ ".inst 0x6f81e87b // udot v27.4s, v3.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e87e // udot v30.4s, v3.16b, v1.4b[3]\n"
+ ".inst 0x6f80e08a // udot v10.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e08d // udot v13.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6f80e890 // udot v16.4s, v4.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e893 // udot v19.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6f81e096 // udot v22.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e099 // udot v25.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6f81e89c // udot v28.4s, v4.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e89f // udot v31.4s, v4.16b, v1.4b[3]\n"
+ "cbz x19, 5f\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x20, #0x10]\n"
+ ".inst 0x6f80e0a8 // udot v8.4s, v5.16b, v0.4b[0]\n"
+ "ldr q7, [x20, #0x20]\n"
+ ".inst 0x6fa0e0ab // udot v11.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8ae // udot v14.4s, v5.16b, v0.4b[2]\n"
+ "add x20, x20, #0x30\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0b7 // udot v23.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8ba // udot v26.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8bd // udot v29.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6f80e0c9 // udot v9.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0cc // udot v12.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8cf // udot v15.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d8 // udot v24.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8db // udot v27.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8de // udot v30.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6f80e0ea // udot v10.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0ed // udot v13.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0f9 // udot v25.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6f81e8fc // udot v28.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8ff // udot v31.4s, v7.16b, v1.4b[3]\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "str q8, [%x[Cpanel], #0x0]\n"
+ "str q9, [%x[Cpanel], #0x10]\n"
+ "str q10, [%x[Cpanel], #0x20]\n"
+ "str q11, [%x[Cpanel], #0x30]\n"
+ "str q12, [%x[Cpanel], #0x40]\n"
+ "str q13, [%x[Cpanel], #0x50]\n"
+ "str q14, [%x[Cpanel], #0x60]\n"
+ "str q15, [%x[Cpanel], #0x70]\n"
+ "str q16, [%x[Cpanel], #0x80]\n"
+ "str q17, [%x[Cpanel], #0x90]\n"
+ "str q18, [%x[Cpanel], #0xa0]\n"
+ "str q19, [%x[Cpanel], #0xb0]\n"
+ "str q20, [%x[Cpanel], #0xc0]\n"
+ "str q21, [%x[Cpanel], #0xd0]\n"
+ "str q22, [%x[Cpanel], #0xe0]\n"
+ "str q23, [%x[Cpanel], #0xf0]\n"
+ "str q24, [%x[Cpanel], #0x100]\n"
+ "str q25, [%x[Cpanel], #0x110]\n"
+ "str q26, [%x[Cpanel], #0x120]\n"
+ "str q27, [%x[Cpanel], #0x130]\n"
+ "str q28, [%x[Cpanel], #0x140]\n"
+ "str q29, [%x[Cpanel], #0x150]\n"
+ "str q30, [%x[Cpanel], #0x160]\n"
+ "str q31, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
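The generic and x1 variants compute the same result and differ only in scheduling and register allocation. As a hedged reference, assuming the usual arm_gemm packing (A in blocks of 8 rows, B in blocks of 12 columns, K grouped by 4 for udot), each inner call produces one 8x12 uint32 tile equivalent to:

    // Reference sketch with un-packed, row-major operands; the real kernel
    // reads pre-interleaved panels, but the arithmetic is the same.
    #include <cstdint>
    static void ref_tile_u8u32_8x12(const uint8_t *A, // 8 x K, row-major
                                    const uint8_t *B, // 12 x K, one row per output column
                                    uint32_t *C,      // 8 x 12, row-major, as the q8..q31 stores lay it out
                                    int K) {
        for (int r = 0; r < 8; r++) {
            for (int c = 0; c < 12; c++) {
                uint32_t acc = 0;
                for (int k = 0; k < K; k++)
                    acc += uint32_t(A[r * K + k]) * uint32_t(B[c * K + k]);
                C[r * 12 + c] = acc;
            }
        }
    }

The 24 q-register stores after label 5 write exactly such a block: 24 x 16 bytes = 384 bytes = 8 x 12 uint32 values, matching the 0x180 Cpanel increment.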
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index 99dd0be0d9..f492a474ae 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,64 +10,103 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef __aarch64__
-
-#include <cstdint>
#include "../std_transforms_fixed.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void a64_interleaved_u8u32_mmla_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_interleaved_u8u32_mmla_8x12( ARGLIST );
-class cls_a64_interleaved_u8u32_mmla_8x12 {
+class cls_a64_interleaved_u8u32_mmla_8x12
+{
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
- typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return 12;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return 4;
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 8;
}
- // Use the standard fixed size transforms.
+
StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.58, 4.06, 8.02 };
+ case CPUModel::A510:
+ return { 47.83, 3.59, 3.72 };
+ case CPUModel::V1:
+ return { 111.52, 4.97, 10.80 };
+ }
+ }
+
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.57, 4.10, 0.51 };
+ case CPUModel::A510:
+ return { 47.66, 2.47, 0.29 };
+ case CPUModel::V1:
+ return { 111.60, 4.95, 0.66 };
+ }
+ }
+
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // __aarch64__
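The three-value initialisers above are per-core throughput estimates consumed by the updated selection heuristics; assuming the PerformanceParameters layout in performance_parameters.hpp (MACs per cycle for the kernel, then bytes per cycle for the prepare and merge stages), a selector can turn them into a rough cycle count. A hedged sketch with an assumed cost model follows; the real heuristic lives in the gemm_* selection code:

    // Hedged sketch only: the struct mirrors the assumed field order, and the
    // byte counts per stage are illustrative, not the library's exact model.
    #include <cstdint>
    struct PerfParams {
        float kernel_macs_cycle;
        float prepare_bytes_cycle;
        float merge_bytes_cycle;
    };
    static float estimate_cycles(uint64_t M, uint64_t N, uint64_t K, const PerfParams &p) {
        float mac_cycles     = float(M * N * K) / p.kernel_macs_cycle;  // main GEMM work
        float prepare_cycles = p.prepare_bytes_cycle > 0
                             ? float(M * K) / p.prepare_bytes_cycle : 0.0f;  // pack A bytes
        float merge_cycles   = p.merge_bytes_cycle > 0
                             ? float(M * N * sizeof(uint32_t)) / p.merge_bytes_cycle : 0.0f;  // write C bytes
        return mac_cycles + prepare_cycles + merge_cycles;
    }

Templating get_performance_parameters on the result type lets the plain (uint32 result) and quantized (uint8 result) uses of the same kernel report different constants, as the two branches above do.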
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index 238a703708..e67d17e49a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -23,395 +23,269 @@
*/
#ifdef __aarch64__
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
-
- K /= 8;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
-
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q4, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "movi v14.4s, #0\n"
- "movi v15.4s, #0\n"
- "movi v16.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "movi v17.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- "movi v24.4s, #0\n"
- "movi v25.4s, #0\n"
- "movi v26.4s, #0\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
- "movi v29.4s, #0\n"
- "movi v30.4s, #0\n"
- "movi v31.4s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
-
- "ldp q6, q7, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
-
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
-
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
- "subs %[loops], %[loops], #0x1\n"
-
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
-
- "ldp q4, q5, [%[b_ptr]]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
-
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
-
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
-
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
-
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldp q6, q7, [%[b_ptr], #0x20]\n"
-
- ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr]]\n"
-
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
-
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
-
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
-
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
-
- "ldp q4, q5, [%[b_ptr], #0x40]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
-
- ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
-
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
-
- ".inst 0x6e87a47b // ummla v27.4s, v3.16b, v7.16b\n"
- "add %[b_ptr], %[b_ptr], #0xc0\n"
- "ldp q6, q7, [%[b_ptr], #-0x60]\n"
-
- ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
-
- ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
-
- ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
-
- ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
-
- "ldp q4, q5, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
-
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n"
-
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
-
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
-
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
- "b.ne 2b\n"
-
- "1:\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- "cbz %[tails], 3f\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- "ldr q4, [%[b_ptr]]\n"
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x10]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #0x20]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #0x30]\n"
- ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #0x40]\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #0x20]\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #0x50]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #0x30]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- "add %[b_ptr], %[b_ptr], #0xe0\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47b // ummla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x70]\n"
- ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "str q0, [%[c_ptr]]\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "b 4f\n"
- "3:\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x80]\n"
- ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x70]\n"
- ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x60]\n"
- ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
- ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x50]\n"
- ".inst 0x6e84a40c // ummla v12.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a432 // ummla v18.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a458 // ummla v24.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47e // ummla v30.4s, v3.16b, v4.16b\n"
- "ldr q4, [%[b_ptr], #-0x40]\n"
- ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
- "ldr q0, [%[a_ptr], #-0x40]\n"
- ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
- "ldr q1, [%[a_ptr], #-0x30]\n"
- ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
- "ldr q2, [%[a_ptr], #-0x20]\n"
- ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
- "ldr q5, [%[b_ptr], #-0x30]\n"
- ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
- "ldr q3, [%[a_ptr], #-0x10]\n"
- ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
- ".inst 0x6e87a409 // ummla v9.4s, v0.16b, v7.16b\n"
- ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
- "ldr q6, [%[b_ptr], #-0x20]\n"
- ".inst 0x6e87a42f // ummla v15.4s, v1.16b, v7.16b\n"
- ".inst 0x6e87a455 // ummla v21.4s, v2.16b, v7.16b\n"
- ".inst 0x6e87a47b // ummla v27.4s, v3.16b, v7.16b\n"
- "ldr q7, [%[b_ptr], #-0x10]\n"
- ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
- ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
- ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
- ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
- ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
- ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
- ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
- ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
- "uzp2 v4.2d, v10.2d, v11.2d\n"
- ".inst 0x6e86a40c // ummla v12.4s, v0.16b, v6.16b\n"
- ".inst 0x6e86a432 // ummla v18.4s, v1.16b, v6.16b\n"
- ".inst 0x6e86a458 // ummla v24.4s, v2.16b, v6.16b\n"
- ".inst 0x6e86a47e // ummla v30.4s, v3.16b, v6.16b\n"
- "uzp1 v6.2d, v14.2d, v15.2d\n"
- ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
- "uzp1 v0.2d, v8.2d, v9.2d\n"
- ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
- "uzp1 v1.2d, v10.2d, v11.2d\n"
- "uzp2 v5.2d, v12.2d, v13.2d\n"
- "str q0, [%[c_ptr]]\n"
- ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
- "uzp1 v2.2d, v12.2d, v13.2d\n"
- "uzp1 v0.2d, v18.2d, v19.2d\n"
- "str q1, [%[c_ptr], #0x10]\n"
- "uzp2 v1.2d, v14.2d, v15.2d\n"
- ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
- "uzp2 v3.2d, v8.2d, v9.2d\n"
- "str q2, [%[c_ptr], #0x20]\n"
- "uzp1 v7.2d, v16.2d, v17.2d\n"
- "4:\n"
- "uzp2 v2.2d, v16.2d, v17.2d\n"
- "str q3, [%[c_ptr], #0x30]\n"
- "uzp2 v3.2d, v18.2d, v19.2d\n"
- "str q4, [%[c_ptr], #0x40]\n"
- "uzp1 v4.2d, v20.2d, v21.2d\n"
- "str q5, [%[c_ptr], #0x50]\n"
- "uzp1 v5.2d, v22.2d, v23.2d\n"
- "str q6, [%[c_ptr], #0x60]\n"
- "uzp1 v6.2d, v24.2d, v25.2d\n"
- "str q7, [%[c_ptr], #0x70]\n"
- "uzp2 v7.2d, v20.2d, v21.2d\n"
- "str q0, [%[c_ptr], #0x80]\n"
- "uzp2 v0.2d, v22.2d, v23.2d\n"
- "str q1, [%[c_ptr], #0x90]\n"
- "uzp2 v1.2d, v24.2d, v25.2d\n"
- "str q2, [%[c_ptr], #0xa0]\n"
- "uzp1 v2.2d, v26.2d, v27.2d\n"
- "str q3, [%[c_ptr], #0xb0]\n"
- "uzp1 v3.2d, v28.2d, v29.2d\n"
- "str q4, [%[c_ptr], #0xc0]\n"
- "uzp1 v4.2d, v30.2d, v31.2d\n"
- "str q5, [%[c_ptr], #0xd0]\n"
- "uzp2 v5.2d, v26.2d, v27.2d\n"
- "str q6, [%[c_ptr], #0xe0]\n"
- "uzp2 v6.2d, v28.2d, v29.2d\n"
- "str q7, [%[c_ptr], #0xf0]\n"
- "uzp2 v7.2d, v30.2d, v31.2d\n"
- "str q0, [%[c_ptr], #0x100]\n"
- "str q1, [%[c_ptr], #0x110]\n"
- "str q2, [%[c_ptr], #0x120]\n"
- "str q3, [%[c_ptr], #0x130]\n"
- "str q4, [%[c_ptr], #0x140]\n"
- "str q5, [%[c_ptr], #0x150]\n"
- "str q6, [%[c_ptr], #0x160]\n"
- "str q7, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
+void a64_interleaved_u8u32_mmla_8x12(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ldr q4, [x20, #0x0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "ldr q5, [x20, #0x10]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "add x20, x20, #0x20\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ "ldr q6, [x20, #0x0]\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldr q4, [x20, #0x20]\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ "ldr q6, [x20, #0x40]\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ "ldr q0, [%x[Apanel], #0x10]\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ "ldr q1, [%x[Apanel], #0x20]\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ "ldr q7, [x20, #0x50]\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ "ldr q2, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "ldr q3, [%x[Apanel], #0x40]\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ldr q4, [x20, #0x60]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
+ "ldr q5, [x20, #0x70]\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
+ "ldr q6, [x20, #0x80]\n"
+ "ldr q7, [x20, #0x90]\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ "ldr q4, [x20, #0xa0]\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ "ldr q0, [%x[Apanel], #0x50]\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ "ldr q1, [%x[Apanel], #0x60]\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ "ldr q5, [x20, #0xb0]\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ "ldr q2, [%x[Apanel], #0x70]\n"
+ ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "add x20, x20, #0xc0\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ldr q3, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a40b // ummla v11.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e85a431 // ummla v17.4s, v1.16b, v5.16b\n"
+ "ldr q6, [x20, #0x0]\n"
+ ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a457 // ummla v23.4s, v2.16b, v5.16b\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47d // ummla v29.4s, v3.16b, v5.16b\n"
+ "ldr q4, [x20, #0x20]\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x6e86a409 // ummla v9.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e86a42f // ummla v15.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e86a455 // ummla v21.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e86a47b // ummla v27.4s, v3.16b, v6.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x6e87a40c // ummla v12.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e84a40a // ummla v10.4s, v0.16b, v4.16b\n"
+ "add x20, x20, #0x40\n"
+ ".inst 0x6e85a40d // ummla v13.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e87a432 // ummla v18.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e84a430 // ummla v16.4s, v1.16b, v4.16b\n"
+ ".inst 0x6e85a433 // ummla v19.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e87a458 // ummla v24.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e87a47e // ummla v30.4s, v3.16b, v7.16b\n"
+ ".inst 0x6e84a456 // ummla v22.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e85a459 // ummla v25.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e84a47c // ummla v28.4s, v3.16b, v4.16b\n"
+ ".inst 0x6e85a47f // ummla v31.4s, v3.16b, v5.16b\n"
+ "cbz x19, 5f\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q0, [%x[Apanel], #0x0]\n"
+ ".inst 0x6e86a408 // ummla v8.4s, v0.16b, v6.16b\n"
+ "ldr q1, [%x[Apanel], #0x10]\n"
+ "ldr q7, [x20, #0x10]\n"
+ ".inst 0x6e86a42e // ummla v14.4s, v1.16b, v6.16b\n"
+ "ldr q2, [%x[Apanel], #0x20]\n"
+ "ldr q3, [%x[Apanel], #0x30]\n"
+ ".inst 0x6e87a40b // ummla v11.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e87a431 // ummla v17.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e86a454 // ummla v20.4s, v2.16b, v6.16b\n"
+ "ldr q4, [x20, #0x20]\n"
+ ".inst 0x6e87a457 // ummla v23.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47a // ummla v26.4s, v3.16b, v6.16b\n"
+ "ldr q5, [x20, #0x30]\n"
+ ".inst 0x6e87a47d // ummla v29.4s, v3.16b, v7.16b\n"
+ "ldr q6, [x20, #0x40]\n"
+ "ldr q7, [x20, #0x50]\n"
+ ".inst 0x6e84a409 // ummla v9.4s, v0.16b, v4.16b\n"
+ ".inst 0x6e84a42f // ummla v15.4s, v1.16b, v4.16b\n"
+ "add x20, x20, #0x60\n"
+ ".inst 0x6e84a455 // ummla v21.4s, v2.16b, v4.16b\n"
+ ".inst 0x6e84a47b // ummla v27.4s, v3.16b, v4.16b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6e85a40c // ummla v12.4s, v0.16b, v5.16b\n"
+ ".inst 0x6e86a40a // ummla v10.4s, v0.16b, v6.16b\n"
+ ".inst 0x6e87a40d // ummla v13.4s, v0.16b, v7.16b\n"
+ ".inst 0x6e85a432 // ummla v18.4s, v1.16b, v5.16b\n"
+ ".inst 0x6e86a430 // ummla v16.4s, v1.16b, v6.16b\n"
+ ".inst 0x6e87a433 // ummla v19.4s, v1.16b, v7.16b\n"
+ ".inst 0x6e85a458 // ummla v24.4s, v2.16b, v5.16b\n"
+ ".inst 0x6e85a47e // ummla v30.4s, v3.16b, v5.16b\n"
+ ".inst 0x6e86a456 // ummla v22.4s, v2.16b, v6.16b\n"
+ ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
+ ".inst 0x6e86a47c // ummla v28.4s, v3.16b, v6.16b\n"
+ ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
+ "5:" // multiply loop done
+ "subs x22, x22, #0x1\n"
+ "uzp1 v4.2d, v8.2d, v11.2d\n"
+ "uzp2 v8.2d, v8.2d, v11.2d\n"
+ "uzp1 v11.2d, v9.2d, v12.2d\n"
+ "uzp2 v9.2d, v9.2d, v12.2d\n"
+ "str q4, [%x[Cpanel], #0x0]\n"
+ "uzp1 v12.2d, v10.2d, v13.2d\n"
+ "uzp2 v10.2d, v10.2d, v13.2d\n"
+ "str q11, [%x[Cpanel], #0x10]\n"
+ "str q12, [%x[Cpanel], #0x20]\n"
+ "uzp1 v13.2d, v14.2d, v17.2d\n"
+ "uzp2 v14.2d, v14.2d, v17.2d\n"
+ "str q8, [%x[Cpanel], #0x30]\n"
+ "uzp1 v17.2d, v15.2d, v18.2d\n"
+ "uzp2 v15.2d, v15.2d, v18.2d\n"
+ "str q9, [%x[Cpanel], #0x40]\n"
+ "uzp1 v18.2d, v16.2d, v19.2d\n"
+ "uzp2 v16.2d, v16.2d, v19.2d\n"
+ "str q10, [%x[Cpanel], #0x50]\n"
+ "uzp1 v19.2d, v20.2d, v23.2d\n"
+ "uzp2 v20.2d, v20.2d, v23.2d\n"
+ "str q13, [%x[Cpanel], #0x60]\n"
+ "uzp1 v23.2d, v21.2d, v24.2d\n"
+ "uzp2 v21.2d, v21.2d, v24.2d\n"
+ "str q17, [%x[Cpanel], #0x70]\n"
+ "uzp1 v24.2d, v22.2d, v25.2d\n"
+ "uzp2 v22.2d, v22.2d, v25.2d\n"
+ "str q18, [%x[Cpanel], #0x80]\n"
+ "uzp1 v25.2d, v26.2d, v29.2d\n"
+ "uzp2 v26.2d, v26.2d, v29.2d\n"
+ "str q14, [%x[Cpanel], #0x90]\n"
+ "uzp1 v29.2d, v27.2d, v30.2d\n"
+ "uzp2 v27.2d, v27.2d, v30.2d\n"
+ "str q15, [%x[Cpanel], #0xa0]\n"
+ "uzp1 v30.2d, v28.2d, v31.2d\n"
+ "uzp2 v28.2d, v28.2d, v31.2d\n"
+ "str q16, [%x[Cpanel], #0xb0]\n"
+ "str q19, [%x[Cpanel], #0xc0]\n"
+ "str q23, [%x[Cpanel], #0xd0]\n"
+ "str q24, [%x[Cpanel], #0xe0]\n"
+ "str q20, [%x[Cpanel], #0xf0]\n"
+ "str q21, [%x[Cpanel], #0x100]\n"
+ "str q22, [%x[Cpanel], #0x110]\n"
+ "str q25, [%x[Cpanel], #0x120]\n"
+ "str q29, [%x[Cpanel], #0x130]\n"
+ "str q30, [%x[Cpanel], #0x140]\n"
+ "str q26, [%x[Cpanel], #0x150]\n"
+ "str q27, [%x[Cpanel], #0x160]\n"
+ "str q28, [%x[Cpanel], #0x170]\n"
+ "add %x[Cpanel], %x[Cpanel], #0x180\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
}
} // namespace arm_gemm
-
-#endif // __aarch64__
\ No newline at end of file
+#endif // __aarch64__
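The rewrite keeps the same UMMLA scheme as the code it replaces: per the Armv8.6-A definition, each "ummla vd.4s, vn.16b, vm.16b" treats vn and vm as 2x8 uint8 matrices and accumulates their 2x2 uint32 product into vd (a hedged scalar model follows). That 2x2 tiling is why the epilogue needs the uzp1/uzp2 pairs: they de-interleave the paired rows back into whole rows before the q-register stores.

    // Scalar model of UMMLA: vn holds two A rows x eight K values, vm two B
    // columns x eight K values; vd's four uint32 lanes gain A * B^T (2x2).
    #include <cstdint>
    static void ummla(uint32_t vd[4], const uint8_t vn[16], const uint8_t vm[16]) {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                uint32_t sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += uint32_t(vn[8 * i + k]) * uint32_t(vm[8 * j + k]);
                vd[2 * i + j] += sum;
            }
        }
    }

For example, "uzp1 v4.2d, v8.2d, v11.2d" picks the even 64-bit lanes, turning the two 2x2 blocks {rows 0-1, cols 0-1} and {rows 0-1, cols 2-3} into row 0, columns 0-3, ready for the row-major Cpanel stores.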
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
index f327e84861..39399e0ba8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
@@ -28,6 +28,8 @@
#include "../std_transforms_fixed.hpp"
#include "../performance_parameters.hpp"
+#include "../bfloat.hpp"
+
namespace arm_gemm {
// Actual kernel implementations
@@ -68,19 +70,32 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ template<typename T>
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
- switch (ci->get_cpu_model()) {
- case CPUModel::A55r1:
- return { 3.954, 1.252, 1.141 };
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ case CPUModel::A55r1:
+ return { 3.954, 1.252, 1.141 };
- case CPUModel::A53:
- return { 2.777, 0.987, 0.898 };
+ case CPUModel::A53:
+ return { 2.777, 0.987, 0.898 };
- case CPUModel::A73:
- return { 2.885, 1.429, 1.163 };
+ case CPUModel::A73:
+ return { 2.885, 1.429, 1.163 };
- default:
- return { 7.2307, 3.876, 2.932 };
+ default:
+ return { 7.2307, 3.876, 2.932 };
+ }
+ }
+
+ if (std::is_same<T, bfloat16>::value) {
+ switch(ci->get_cpu_model()) {
+ case CPUModel::A510:
+ return { 4.98, 2.27, 3.05 };
+
+ default:
+ return { 7.99, 5.06, 7.32 };
+ }
}
}
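With the template parameter, this fp32 kernel now reports separate constants for fp32 and bf16 problems, presumably so the selector can rank it against the new bf16 kernels when fast_mode routes fp32 GEMMs through bf16 arithmetic. Illustrative use, assuming the class name cls_a64_sgemm_8x12 follows the cls_<kernel> convention seen elsewhere in this change and that ci is the const CPUInfo* passed by the selector:

    // Hedged usage sketch: per-type tuning constants for the same kernel.
    PerformanceParameters pp_fp32 = cls_a64_sgemm_8x12::get_performance_parameters<float>(ci);
    PerformanceParameters pp_bf16 = cls_a64_sgemm_8x12::get_performance_parameters<bfloat16>(ci);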
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
index 2e87a47036..52548b462c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
@@ -24,7 +24,6 @@
#ifdef __aarch64__
#include <algorithm>
-#include <limits>
#include "arm_gemm.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
index ca4a44a2c7..deaef27ee9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
@@ -24,7 +24,6 @@
#ifdef __aarch64__
#include <algorithm>
-#include <limits>
#include "arm_gemm.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
deleted file mode 100644
index 57fd9c909e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "../performance_parameters.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_gemv_fp32_mla_8VL(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-
-class cls_sve_gemv_fp32_mla_8VL
-{
-public:
- typedef float operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-
- static unsigned int out_width()
- {
- return 8 * get_vector_length<float>();
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 1;
- }
-
- static constexpr bool supports_accumulate()
- {
- return false;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 1, 8, 1> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_gemv_fp32_mla_8VL;
-
- cls_sve_gemv_fp32_mla_8VL(const CPUInfo *)
- {
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
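For scale (worked example, not from the source): the removed class advertised out_width() = 8 * get_vector_length<float>(), so on a 256-bit SVE implementation that is 8 * (256 / 32) = 64 fp32 outputs per pass, and 8 * (128 / 32) = 32 on a 128-bit implementation.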
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
deleted file mode 100644
index 78387de90c..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
+++ /dev/null
@@ -1,1372 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "arm_gemm.hpp"
-#include "../../utils.hpp"
-
-#include <cassert>
-
-namespace arm_gemm {
-
-void sve_gemv_fp32_mla_8VL (
- const float *A_ptr, const float *B_ptr, float *output_ptr,
- size_t N, size_t K,
- const float *bias, Activation act, bool
-)
-{
- struct KernelArgs {
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- const float *B_ptr = {};
- size_t output_offset = {};
- unsigned int input_initial_col = {};
- } ka;
-
- unsigned long flags=0;
- ka.B_ptr = B_ptr;
- switch(act.type) {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- ka.maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- ka.minval = 0;
- flags |= 0x2;
- break;
- }
- __asm__ __volatile__(
- "ptrue p2.b\n"
- "cntw x24\n"
- "add x23, %x[N], x24\n"
- "sub x23, x23, #0x1\n"
- "udiv x23, x23, x24\n"
- "mov x22, %x[bias]\n"
- "1:" // Column loop
- "cmp x23, #0x8\n"
- "bge 50f\n"
- "cmp x23, #0x6\n"
- "bgt 43f\n"
- "beq 36f\n"
- "cmp x23, #0x4\n"
- "bgt 29f\n"
- "beq 22f\n"
- "cmp x23, #0x2\n"
- "bgt 15f\n"
- "beq 8f\n"
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "whilelt p1.s, XZR, %x[N]\n"
- "cbz x22, 2f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "addvl x22, x22, #1\n"
- "b 3f\n"
- "2:" // Width 1: no bias
- "mov z24.b, #0x0\n"
- "3:" // Width 1: setup done
- "cmp x21, #0x4\n"
- "ble 5f\n"
- "4:" // Width 1: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x20, x20, #0x10\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x21, x21, #0x4\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z2.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "cmp x21, #0x4\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z3.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z4.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 4b\n"
- "5:" // Width 1: Multiply loop: Single iteration only
- "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z5.s, z0.s[0]\n"
- "add x20, x20, #0x10\n"
- "subs x21, x21, #0x1\n"
- "ble 6f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z6.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ble 6f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z7.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ble 6f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z8.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "6:" // Width 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 7f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "7:" // Width 1: No activation
- "st1w { z24.s }, p1, [%x[output_ptr]]\n"
- "addvl %x[output_ptr], %x[output_ptr], #1\n"
- "b 57f\n"
- "8:" // Width 2
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "sub x19, %x[N], x24\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 9f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "addvl x22, x22, #2\n"
- "b 10f\n"
- "9:" // Width 2: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "10:" // Width 2: setup done
- "cmp x21, #0x4\n"
- "ble 12f\n"
- "11:" // Width 2: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "sub x21, x21, #0x4\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z3.s, z0.s[1]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z4.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "cmp x21, #0x4\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z5.s, z0.s[2]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z6.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z7.s, z0.s[3]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z8.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 11b\n"
- "12:" // Width 2: Multiply loop: Single iteration only
- "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z9.s, z0.s[0]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z10.s, z0.s[0]\n"
- "subs x21, x21, #0x1\n"
- "ble 13f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z11.s, z0.s[1]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z12.s, z0.s[1]\n"
- "ble 13f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[2]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z14.s, z0.s[2]\n"
- "ble 13f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z15.s, z0.s[3]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z25.s, z16.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "13:" // Width 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 14f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "14:" // Width 2: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #2\n"
- "b 57f\n"
- "15:" // Width 3
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x2\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 16f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "addvl x22, x22, #3\n"
- "b 17f\n"
- "16:" // Width 3: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "17:" // Width 3: setup done
- "cmp x21, #0x4\n"
- "ble 19f\n"
- "18:" // Width 3: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "cmp x21, #0x4\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z4.s, z0.s[1]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z5.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z6.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z7.s, z0.s[2]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z8.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z9.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z10.s, z0.s[3]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z11.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z12.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 18b\n"
- "19:" // Width 3: Multiply loop: Single iteration only
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z13.s, z0.s[0]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z14.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z15.s, z0.s[0]\n"
- "ble 20f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z16.s, z0.s[1]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z17.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z18.s, z0.s[1]\n"
- "ble 20f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z19.s, z0.s[2]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z20.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z21.s, z0.s[2]\n"
- "ble 20f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z22.s, z0.s[3]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z23.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z26.s, z1.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "20:" // Width 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 21f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "21:" // Width 3: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #3\n"
- "b 57f\n"
- "22:" // Width 4
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x3\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 23f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "addvl x22, x22, #4\n"
- "b 24f\n"
- "23:" // Width 4: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "24:" // Width 4: setup done
- "cmp x21, #0x4\n"
- "ble 26f\n"
- "25:" // Width 4: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "cmp x21, #0x4\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z5.s, z0.s[1]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z6.s, z0.s[1]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z7.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z27.s, z8.s, z0.s[1]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z9.s, z0.s[2]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z10.s, z0.s[2]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z11.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z27.s, z12.s, z0.s[2]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[3]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z14.s, z0.s[3]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z15.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z27.s, z16.s, z0.s[3]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 25b\n"
- "26:" // Width 4: Multiply loop: Single iteration only
- "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z17.s, z0.s[0]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z18.s, z0.s[0]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z19.s, z0.s[0]\n"
- "fmla z27.s, z20.s, z0.s[0]\n"
- "ble 27f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z21.s, z0.s[1]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z22.s, z0.s[1]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z23.s, z0.s[1]\n"
- "fmla z27.s, z1.s, z0.s[1]\n"
- "ble 27f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z2.s, z0.s[2]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z3.s, z0.s[2]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z4.s, z0.s[2]\n"
- "fmla z27.s, z5.s, z0.s[2]\n"
- "ble 27f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z6.s, z0.s[3]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z7.s, z0.s[3]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z26.s, z8.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z27.s, z9.s, z0.s[3]\n"
- "27:" // Width 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 28f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "28:" // Width 4: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #4\n"
- "b 57f\n"
- "29:" // Width 5
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x4\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 30f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
- "addvl x22, x22, #5\n"
- "b 31f\n"
- "30:" // Width 5: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "31:" // Width 5: setup done
- "cmp x21, #0x4\n"
- "ble 33f\n"
- "32:" // Width 5: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "cmp x21, #0x4\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z28.s, z5.s, z0.s[0]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z6.s, z0.s[1]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z7.s, z0.s[1]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z8.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z27.s, z9.s, z0.s[1]\n"
- "fmla z28.s, z10.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z11.s, z0.s[2]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z12.s, z0.s[2]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z13.s, z0.s[2]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z14.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z28.s, z15.s, z0.s[2]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z16.s, z0.s[3]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z17.s, z0.s[3]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z18.s, z0.s[3]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z19.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z28.s, z20.s, z0.s[3]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 32b\n"
- "33:" // Width 5: Multiply loop: Single iteration only
- "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z21.s, z0.s[0]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z22.s, z0.s[0]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z23.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z1.s, z0.s[0]\n"
- "fmla z28.s, z2.s, z0.s[0]\n"
- "ble 34f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z3.s, z0.s[1]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z4.s, z0.s[1]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z5.s, z0.s[1]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z6.s, z0.s[1]\n"
- "fmla z28.s, z7.s, z0.s[1]\n"
- "ble 34f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z8.s, z0.s[2]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z9.s, z0.s[2]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z10.s, z0.s[2]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z11.s, z0.s[2]\n"
- "fmla z28.s, z12.s, z0.s[2]\n"
- "ble 34f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[3]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z14.s, z0.s[3]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z15.s, z0.s[3]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z16.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z28.s, z17.s, z0.s[3]\n"
- "34:" // Width 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 35f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "35:" // Width 5: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
- "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #5\n"
- "b 57f\n"
- "36:" // Width 6
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x5\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 37f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
- "addvl x22, x22, #6\n"
- "b 38f\n"
- "37:" // Width 6: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "mov z29.b, #0x0\n"
- "38:" // Width 6: setup done
- "cmp x21, #0x4\n"
- "ble 40f\n"
- "39:" // Width 6: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "cmp x21, #0x4\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z5.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z29.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z7.s, z0.s[1]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z8.s, z0.s[1]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z10.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z28.s, z11.s, z0.s[1]\n"
- "fmla z29.s, z12.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[2]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z14.s, z0.s[2]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z15.s, z0.s[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z16.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z28.s, z17.s, z0.s[2]\n"
- "fmla z29.s, z18.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z19.s, z0.s[3]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z20.s, z0.s[3]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z21.s, z0.s[3]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z22.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z28.s, z23.s, z0.s[3]\n"
- "fmla z29.s, z1.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "bgt 39b\n"
- "40:" // Width 6: Multiply loop: Single iteration only
- "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z4.s, z0.s[0]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z27.s, z5.s, z0.s[0]\n"
- "fmla z28.s, z6.s, z0.s[0]\n"
- "fmla z29.s, z7.s, z0.s[0]\n"
- "ble 41f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z8.s, z0.s[1]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z10.s, z0.s[1]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z11.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z12.s, z0.s[1]\n"
- "fmla z29.s, z13.s, z0.s[1]\n"
- "ble 41f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z14.s, z0.s[2]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z15.s, z0.s[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z16.s, z0.s[2]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z17.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z18.s, z0.s[2]\n"
- "fmla z29.s, z19.s, z0.s[2]\n"
- "ble 41f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z20.s, z0.s[3]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z21.s, z0.s[3]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z22.s, z0.s[3]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z23.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z28.s, z1.s, z0.s[3]\n"
- "fmla z29.s, z2.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "41:" // Width 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 42f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
- "42:" // Width 6: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
- "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
- "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #6\n"
- "b 57f\n"
- "43:" // Width 7
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x6\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 44f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
- "addvl x22, x22, #7\n"
- "b 45f\n"
- "44:" // Width 7: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "mov z29.b, #0x0\n"
- "mov z30.b, #0x0\n"
- "45:" // Width 7: setup done
- "cmp x21, #0x4\n"
- "ble 47f\n"
- "46:" // Width 7: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "cmp x21, #0x4\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z5.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z29.s, z6.s, z0.s[0]\n"
- "fmla z30.s, z7.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z8.s, z0.s[1]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z10.s, z0.s[1]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z11.s, z0.s[1]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z12.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z29.s, z13.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z30.s, z14.s, z0.s[1]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z15.s, z0.s[2]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z16.s, z0.s[2]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z17.s, z0.s[2]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z18.s, z0.s[2]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z19.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z29.s, z20.s, z0.s[2]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z30.s, z21.s, z0.s[2]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z22.s, z0.s[3]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z23.s, z0.s[3]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z1.s, z0.s[3]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z2.s, z0.s[3]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z3.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z29.s, z4.s, z0.s[3]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "fmla z30.s, z5.s, z0.s[3]\n"
- "bgt 46b\n"
- "47:" // Width 7: Multiply loop: Single iteration only
- "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z6.s, z0.s[0]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z7.s, z0.s[0]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z8.s, z0.s[0]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z27.s, z9.s, z0.s[0]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z10.s, z0.s[0]\n"
- "fmla z29.s, z11.s, z0.s[0]\n"
- "fmla z30.s, z12.s, z0.s[0]\n"
- "ble 48f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z13.s, z0.s[1]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z14.s, z0.s[1]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z15.s, z0.s[1]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z16.s, z0.s[1]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z17.s, z0.s[1]\n"
- "fmla z29.s, z18.s, z0.s[1]\n"
- "fmla z30.s, z19.s, z0.s[1]\n"
- "ble 48f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z20.s, z0.s[2]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z21.s, z0.s[2]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z22.s, z0.s[2]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z23.s, z0.s[2]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z1.s, z0.s[2]\n"
- "fmla z29.s, z2.s, z0.s[2]\n"
- "fmla z30.s, z3.s, z0.s[2]\n"
- "ble 48f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z4.s, z0.s[3]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z5.s, z0.s[3]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z6.s, z0.s[3]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z7.s, z0.s[3]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z28.s, z8.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z29.s, z9.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z10.s, z0.s[3]\n"
- "48:" // Width 7: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 49f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmin z30.s, p2/M, z30.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
- "fmax z30.s, p2/M, z30.s, z17.s\n"
- "49:" // Width 7: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
- "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
- "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
- "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #7\n"
- "b 57f\n"
- "50:" // Width 8
- "mov x21, %x[K]\n"
- "mov x20, %x[A_ptr]\n"
- "mov x19, #0x7\n"
- "msub x19, x24, x19, %x[N]\n"
- "whilelt p1.s, XZR, x19\n"
- "cbz x22, 51f\n"
- "ld1w { z24.s }, p2/Z, [x22]\n"
- "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
- "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
- "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
- "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
- "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
- "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
- "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n"
- "addvl x22, x22, #8\n"
- "b 52f\n"
- "51:" // Width 8: no bias
- "mov z24.b, #0x0\n"
- "mov z25.b, #0x0\n"
- "mov z26.b, #0x0\n"
- "mov z27.b, #0x0\n"
- "mov z28.b, #0x0\n"
- "mov z29.b, #0x0\n"
- "mov z30.b, #0x0\n"
- "mov z31.b, #0x0\n"
- "52:" // Width 8: setup done
- "cmp x21, #0x4\n"
- "ble 54f\n"
- "53:" // Width 8: Multiply loop: Main loop head
- "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "sub x21, x21, #0x4\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z1.s, z0.s[0]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z2.s, z0.s[0]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "cmp x21, #0x4\n"
- "fmla z26.s, z3.s, z0.s[0]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z4.s, z0.s[0]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z5.s, z0.s[0]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "fmla z30.s, z7.s, z0.s[0]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z31.s, z8.s, z0.s[0]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z9.s, z0.s[1]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z10.s, z0.s[1]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z11.s, z0.s[1]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z12.s, z0.s[1]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z13.s, z0.s[1]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z14.s, z0.s[1]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z15.s, z0.s[1]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z31.s, z16.s, z0.s[1]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z17.s, z0.s[2]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z18.s, z0.s[2]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z19.s, z0.s[2]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z20.s, z0.s[2]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z21.s, z0.s[2]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z22.s, z0.s[2]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z23.s, z0.s[2]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z31.s, z1.s, z0.s[2]\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "fmla z24.s, z2.s, z0.s[3]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z25.s, z3.s, z0.s[3]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z4.s, z0.s[3]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z5.s, z0.s[3]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z28.s, z6.s, z0.s[3]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z7.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z8.s, z0.s[3]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
- "fmla z31.s, z9.s, z0.s[3]\n"
- "bgt 53b\n"
- "54:" // Width 8: Multiply loop: Single iteration only
- "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
- "whilelt p0.s, XZR, x21\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "subs x21, x21, #0x1\n"
- "ld1rqw { z0.s }, p0/Z, [x20]\n"
- "fmla z24.s, z10.s, z0.s[0]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
- "fmla z25.s, z11.s, z0.s[0]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "fmla z26.s, z12.s, z0.s[0]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "fmla z27.s, z13.s, z0.s[0]\n"
- "fmla z28.s, z14.s, z0.s[0]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z15.s, z0.s[0]\n"
- "fmla z30.s, z16.s, z0.s[0]\n"
- "fmla z31.s, z17.s, z0.s[0]\n"
- "ble 55f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z18.s, z0.s[1]\n"
- "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z19.s, z0.s[1]\n"
- "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z20.s, z0.s[1]\n"
- "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z21.s, z0.s[1]\n"
- "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "fmla z28.s, z22.s, z0.s[1]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z23.s, z0.s[1]\n"
- "fmla z30.s, z1.s, z0.s[1]\n"
- "fmla z31.s, z2.s, z0.s[1]\n"
- "ble 55f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "subs x21, x21, #0x1\n"
- "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z3.s, z0.s[2]\n"
- "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z4.s, z0.s[2]\n"
- "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z5.s, z0.s[2]\n"
- "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z6.s, z0.s[2]\n"
- "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "fmla z28.s, z7.s, z0.s[2]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z8.s, z0.s[2]\n"
- "fmla z30.s, z9.s, z0.s[2]\n"
- "fmla z31.s, z10.s, z0.s[2]\n"
- "ble 55f\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
- "fmla z24.s, z11.s, z0.s[3]\n"
- "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
- "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
- "fmla z25.s, z12.s, z0.s[3]\n"
- "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
- "fmla z26.s, z13.s, z0.s[3]\n"
- "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
- "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
- "fmla z27.s, z14.s, z0.s[3]\n"
- "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
- "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
- "fmla z28.s, z15.s, z0.s[3]\n"
- "addvl %x[B_ptr], %x[B_ptr], #8\n"
- "fmla z29.s, z16.s, z0.s[3]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
- "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
- "fmla z30.s, z17.s, z0.s[3]\n"
- "fmla z31.s, z18.s, z0.s[3]\n"
- "55:" // Width 8: Multiply loop: multiply skip
- "prfm pldl1keep, [x20, #0x80]\n"
- "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
- "tbz %x[flags], #1, 56f\n"
- "add x19, %x[args_ptr], %[offset_min]\n"
- "ld1rw { z17.s }, p2/Z, [x19]\n"
- "add x19, %x[args_ptr], %[offset_max]\n"
- "ld1rw { z16.s }, p2/Z, [x19]\n"
- "fmin z24.s, p2/M, z24.s, z16.s\n"
- "fmin z25.s, p2/M, z25.s, z16.s\n"
- "fmin z26.s, p2/M, z26.s, z16.s\n"
- "fmin z27.s, p2/M, z27.s, z16.s\n"
- "fmin z28.s, p2/M, z28.s, z16.s\n"
- "fmax z24.s, p2/M, z24.s, z17.s\n"
- "fmax z25.s, p2/M, z25.s, z17.s\n"
- "fmax z26.s, p2/M, z26.s, z17.s\n"
- "fmax z27.s, p2/M, z27.s, z17.s\n"
- "fmax z28.s, p2/M, z28.s, z17.s\n"
- "fmin z29.s, p2/M, z29.s, z16.s\n"
- "fmin z30.s, p2/M, z30.s, z16.s\n"
- "fmin z31.s, p2/M, z31.s, z16.s\n"
- "fmax z29.s, p2/M, z29.s, z17.s\n"
- "fmax z30.s, p2/M, z30.s, z17.s\n"
- "fmax z31.s, p2/M, z31.s, z17.s\n"
- "56:" // Width 8: No activation
- "st1w { z24.s }, p2, [%x[output_ptr]]\n"
- "subs x23, x23, #0x8\n"
- "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
- "sub %x[N], %x[N], x24, LSL #3\n"
- "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
- "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
- "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
- "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
- "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n"
- "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n"
- "addvl %x[output_ptr], %x[output_ptr], #8\n"
- "bgt 1b\n"
- "57:" // Exit
-
- : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
- : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
- : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
- );
-}
-
-} // namespace arm_gemm
-
-#endif
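
The assembly deleted above is the tail of a width-specialised SVE GEMV kernel: each "Width N" path keeps N vector accumulators (z24..z31), seeds them from the bias pointer in x22 when one is supplied, accumulates rows of B against broadcast lanes of the A vector with fmla, optionally clamps with fmin/fmax against the minval/maxval arguments, and stores the final vector through the tail predicate p1. A minimal intrinsics sketch of the same single-vector-width pattern, assuming a plain row-major B and illustrative names throughout (this is a sketch of the idea, not the removed kernel itself):

#include <arm_sve.h>

// One "Width 1" column block: out[0..N) = clamp(bias + A . B), with N at
// most one SVE vector of floats. Predicate pg masks the ragged tail the
// same way p1 does in the assembly above.
static void gemv_width1_sketch(const float *A, const float *B,
                               const float *bias, float *out,
                               int K, int N, float minval, float maxval) {
    svbool_t pg = svwhilelt_b32_s32(0, N);            // tail predicate
    svfloat32_t acc = bias ? svld1_f32(pg, bias)      // seed from bias...
                           : svdup_n_f32(0.0f);       // ...or from zero
    for (int k = 0; k < K; ++k) {
        svfloat32_t b = svld1_f32(pg, B + (size_t)k * N); // one row of B
        acc = svmla_n_f32_x(pg, acc, b, A[k]);        // acc += B[k,:] * A[k]
    }
    acc = svmin_n_f32_x(pg, acc, maxval);             // activation clamp
    acc = svmax_n_f32_x(pg, acc, minval);
    svst1_f32(pg, out, acc);                          // predicated store
}

The real kernel additionally unrolls K by four (the z0.s[0..3] lanes), processes up to eight output vectors per pass over a pretransposed B, and issues prfm prefetches ahead of the B stream.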
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index 7b0282fa32..6677c23216 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -22,10 +22,11 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -43,7 +44,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST );
class cls_sve_hybrid_bf16fp32_dot_6x4VL
{
public:
- typedef bfloat16 operand_type;
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -69,7 +71,24 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 2> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 2> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.83 };
+ case CPUModel::A510:
+ return { 6.80 };
+ case CPUModel::V1:
+ return { 31.55 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL;
@@ -81,4 +100,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
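
The get_performance_parameters() hook added in the hunk above gives the scheduler a rough MACs-per-cycle figure for the bf16 path (6.80 on Cortex-A510, 31.55 on Neoverse V1, 15.83 otherwise; 1.0 for any other operand type), which the improved selection heuristics can weigh against the estimates published by competing kernels. A hedged sketch of how such per-CPU estimates might be compared; the struct and function names here are illustrative, not the library's API:

struct PerfEstimate {
    double macs_per_cycle;   // throughput estimate for this CPU model
};

struct Candidate {
    const char *name;
    PerfEstimate (*estimate)(int cpu_model);   // per-kernel estimator
};

// Pick the candidate with the highest estimated throughput for this CPU.
static const char *pick_kernel(const Candidate *cands, int n, int cpu_model) {
    const char *best = nullptr;
    double best_rate = -1.0;
    for (int i = 0; i < n; ++i) {
        double rate = cands[i].estimate(cpu_model).macs_per_cycle;
        if (rate > best_rate) { best_rate = rate; best = cands[i].name; }
    }
    return best;
}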
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
index 34a657f64f..b794c21807 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -162,13 +162,12 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "cmp x26, #0x8\n"
+ "add x25, x25, #0x10\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
@@ -203,7 +202,6 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -242,9 +240,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
@@ -348,16 +345,14 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
@@ -408,9 +403,7 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
- "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
@@ -465,10 +458,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 19b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -602,21 +593,18 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"add x23, x23, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
- ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
@@ -681,12 +669,9 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
- "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
@@ -756,11 +741,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 32b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -923,26 +905,22 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
@@ -1021,19 +999,15 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- "add x22, x22, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -1114,12 +1088,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 45b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1311,32 +1281,27 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
"add x21, x21, #0x10\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
@@ -1428,22 +1393,17 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
- "add x21, x21, #0x10\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1539,13 +1499,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 58b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1769,37 +1724,31 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
"add x20, x20, #0x10\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
@@ -1905,25 +1854,19 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
- "add x20, x20, #0x10\n"
- ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
@@ -2034,14 +1977,8 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 71b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -2153,4 +2090,4 @@ void sve_hybrid_bf16fp32_dot_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
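
Note: the hunks above drop the per-iteration "prfm pldl1keep" software
prefetches and re-schedule the bfdot instructions between the operand
loads. As a reference for the pattern used throughout -- e.g.
"bfdot z8.s, z6.h, z0.h[0]" -- a scalar model of one indexed BFDOT step
is sketched below; the helper names are illustrative only, not part of
the library.

    // Scalar sketch of one indexed BFDOT step: every fp32 accumulator
    // lane j gains the dot product of the bf16 weight pair b[2j..2j+1]
    // with one bf16 pair broadcast from the A row (ld1rqh replicates a
    // 128-bit quadword, so every segment sees the same pair).
    #include <cstdint>
    #include <cstring>

    static float bf16_to_f32(uint16_t b) {   // bf16 is the high half of an fp32
        uint32_t u = static_cast<uint32_t>(b) << 16;
        float f;
        std::memcpy(&f, &u, sizeof(f));
        return f;
    }

    static void bfdot_lane(float *acc, const uint16_t *b,
                           const uint16_t a01[2], int lanes) {
        for (int j = 0; j < lanes; ++j) {
            acc[j] += bf16_to_f32(b[2 * j])     * bf16_to_f32(a01[0])
                    + bf16_to_f32(b[2 * j + 1]) * bf16_to_f32(a01[1]);
        }
    }

    int main() {
        float acc[4] = {};
        const uint16_t b[8]   = {0x3F80, 0x4000, 0x3F80, 0x3F80,
                                 0x4040, 0x3F80, 0x3F80, 0x4080}; // bf16: 1,2,1,1,3,1,1,4
        const uint16_t a01[2] = {0x3F80, 0x3F80};                 // A pair = (1, 1)
        bfdot_lane(acc, b, a01, 4);  // acc[j] = b[2j]*1 + b[2j+1]*1
        return 0;
    }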
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..b8d237ff23
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL.hpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_bf16fp32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_bf16fp32_mmla_6x4VL
+{
+public:
+ typedef bfloat16 lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 4> transforms = {};
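+ // Estimated per-CPU kernel throughput, consumed by the GEMM kernel
+ // selection heuristics; the constants below are tuning data rather
+ // than exact FLOP rates.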
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 24.74 };
+ case CPUModel::A510:
+ return { 6.74 };
+ case CPUModel::V1:
+ return { 53.59 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_bf16fp32_mmla_6x4VL;
+ cls_sve_hybrid_bf16fp32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
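
This interface header advertises the kernel's blocking scheme to the
framework: a 6-row output tile that is four SVE vectors of fp32 wide,
with K unrolled by 4 bf16 elements (one BFMMLA block per step). A
minimal sketch of what those parameters mean for a concrete vector
length follows; the 256-bit VL is an assumption for illustration,
whereas the real code queries get_vector_length<float>().

    #include <cstdio>

    int main() {
        const unsigned vl_bits    = 256;                // assumed SVE vector length
        const unsigned out_height = 6;                  // rows per output tile
        const unsigned out_width  = (vl_bits / 32) * 4; // fp32 lanes/vector * 4 vectors
        const unsigned k_unroll   = 4;                  // bf16 K elements per MMLA step
        std::printf("output tile %ux%u, K step %u\n", out_height, out_width, k_unroll);
        return 0;
    }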
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..e69293e3f1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,2045 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_bf16fp32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
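+ // Flag bits tested by the assembly below, mirroring the setup above:
+ // bit 0 (0x1) accumulate into the existing output, bit 1 (0x2) apply
+ // the min/max activation clamp, bit 2 (0x4) indirect output buffers,
+ // bit 3 (0x8) indirect (multi-string) input.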
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 66f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 53f\n"
+ "beq 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x9, %x[bias]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 3f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "addvl x9, x9, #4\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x26, #0x8\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "sub x26, x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "cmp x26, #0x8\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "subs x26, x26, #0x4\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ "ble 11f\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 6b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 12f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "13:" // Height 1: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 16f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "addvl x9, x9, #4\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "18:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 21f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "21:" // Height 2: input setup done
+ "cmp x26, #0x8\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "cmp x26, #0x8\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "add x25, x25, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ "ble 24f\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 19b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "25:" // Height 2: No activation
+ "st1w { z7.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 29f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 34f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "34:" // Height 3: input setup done
+ "cmp x26, #0x8\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "cmp x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ "ble 37f\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 32b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "38:" // Height 3: No activation
+ "st1w { z7.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 42f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "44:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "45:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "47:" // Height 4: input setup done
+ "cmp x26, #0x8\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "cmp x26, #0x8\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ "add x22, x22, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ "ble 50f\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 45b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 51f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "51:" // Height 4: No activation
+ "st1w { z7.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x23]\n"
+ "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 55f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 57f\n"
+ "55:" // Height 5: no bias
+ "tbz %x[flags], #0, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z6.d, z31.d\n"
+ "zip2 z31.d, z6.d, z31.d\n"
+ "b 57f\n"
+ "56:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "57:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "58:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 60f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 60f\n"
+ "59:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "60:" // Height 5: input setup done
+ "cmp x26, #0x8\n"
+ "ble 62f\n"
+ "61:" // Height 5: Multiply loop: Main loop head
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "sub x26, x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "cmp x26, #0x8\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "add x24, x24, #0x10\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
+ ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
+ ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
+ ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
+ ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
+ ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
+ ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
+ ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ "bgt 61b\n"
+ "62:" // Height 5: Multiply loop: Single iteration only
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "subs x26, x26, #0x4\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
+ ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
+ ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
+ ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
+ "ble 63f\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
+ ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
+ ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
+ ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
+ ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ "63:" // Height 5: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 58b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "64:" // Height 5: No activation
+ "st1w { z7.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x23]\n"
+ "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x11, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x11\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x11\n"
+ "cbz x9, 68f\n"
+ "ld1w { z8.s }, p5/Z, [x9]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x9, #1, MUL VL]\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x9, #2, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 70f\n"
+ "68:" // Height 6: no bias
+ "tbz %x[flags], #0, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z6.d, z31.d\n"
+ "zip2 z31.d, z6.d, z31.d\n"
+ "b 70f\n"
+ "69:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "70:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "71:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 72f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 73f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "73:" // Height 6: input setup done
+ "cmp x26, #0x8\n"
+ "ble 75f\n"
+ "74:" // Height 6: Multiply loop: Main loop head
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "sub x26, x26, #0x8\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "cmp x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqh { z6.h }, p0/Z, [x20]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
+ "add x20, x20, #0x10\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
+ ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
+ ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-8, MUL VL]\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-7, MUL VL]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
+ ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-6, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-5, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-3, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
+ ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #-2, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
+ ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #-1, MUL VL]\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
+ ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ "bgt 74b\n"
+ "75:" // Height 6: Multiply loop: Single iteration only
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ "whilelt p0.h, XZR, x26\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rqh { z1.h }, p0/Z, [x25]\n"
+ "ld1rqh { z2.h }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqh { z3.h }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "ld1rqh { z5.h }, p0/Z, [x21]\n"
+ ".inst 0x6467e408 // bfmmla z8.s, z0.h, z7.h\n"
+ "ld1rqh { z6.h }, p0/Z, [x20]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6467e450 // bfmmla z16.s, z2.h, z7.h\n"
+ ".inst 0x6467e498 // bfmmla z24.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6466e49c // bfmmla z28.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
+ ".inst 0x6467e451 // bfmmla z17.s, z2.h, z7.h\n"
+ ".inst 0x6467e499 // bfmmla z25.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6466e49d // bfmmla z29.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e40a // bfmmla z10.s, z0.h, z7.h\n"
+ ".inst 0x6467e452 // bfmmla z18.s, z2.h, z7.h\n"
+ ".inst 0x6467e49a // bfmmla z26.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e40e // bfmmla z14.s, z0.h, z6.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ ".inst 0x6466e49e // bfmmla z30.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e453 // bfmmla z19.s, z2.h, z7.h\n"
+ ".inst 0x6467e49b // bfmmla z27.s, z4.h, z7.h\n"
+ ".inst 0x6466e40f // bfmmla z15.s, z0.h, z6.h\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6466e49f // bfmmla z31.s, z4.h, z6.h\n"
+ "ble 76f\n"
+ "ld1h { z7.h }, p5/Z, [x10]\n"
+ ".inst 0x6467e428 // bfmmla z8.s, z1.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #1, MUL VL]\n"
+ ".inst 0x6467e470 // bfmmla z16.s, z3.h, z7.h\n"
+ ".inst 0x6467e4b8 // bfmmla z24.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #2, MUL VL]\n"
+ ".inst 0x6466e42c // bfmmla z12.s, z1.h, z6.h\n"
+ ".inst 0x6466e474 // bfmmla z20.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bc // bfmmla z28.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #3, MUL VL]\n"
+ ".inst 0x6467e429 // bfmmla z9.s, z1.h, z7.h\n"
+ ".inst 0x6467e471 // bfmmla z17.s, z3.h, z7.h\n"
+ ".inst 0x6467e4b9 // bfmmla z25.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #4, MUL VL]\n"
+ ".inst 0x6466e42d // bfmmla z13.s, z1.h, z6.h\n"
+ ".inst 0x6466e475 // bfmmla z21.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bd // bfmmla z29.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #5, MUL VL]\n"
+ ".inst 0x6467e42a // bfmmla z10.s, z1.h, z7.h\n"
+ ".inst 0x6467e472 // bfmmla z18.s, z3.h, z7.h\n"
+ ".inst 0x6467e4ba // bfmmla z26.s, z5.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x10, #6, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ ".inst 0x6466e476 // bfmmla z22.s, z3.h, z6.h\n"
+ ".inst 0x6466e4be // bfmmla z30.s, z5.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #8\n"
+ ".inst 0x6467e42b // bfmmla z11.s, z1.h, z7.h\n"
+ ".inst 0x6467e473 // bfmmla z19.s, z3.h, z7.h\n"
+ ".inst 0x6467e4bb // bfmmla z27.s, z5.h, z7.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e477 // bfmmla z23.s, z3.h, z6.h\n"
+ ".inst 0x6466e4bf // bfmmla z31.s, z5.h, z6.h\n"
+ "76:" // Height 6: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 71b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 77f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z7.s, p5/M, z7.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z7.s, p5/M, z7.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmin z28.s, p5/M, z28.s, z0.s\n"
+ "fmin z29.s, p5/M, z29.s, z0.s\n"
+ "fmin z30.s, p5/M, z30.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z28.s, p5/M, z28.s, z1.s\n"
+ "fmax z29.s, p5/M, z29.s, z1.s\n"
+ "fmax z30.s, p5/M, z30.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "77:" // Height 6: No activation
+ "st1w { z7.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x23]\n"
+ "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x21]\n"
+ "st1w { z28.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "decw x11, ALL, MUL #4\n"
+ "cmp x11, XZR\n"
+ "bgt 67b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 80f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "79:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "80:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
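
Note on the BFMMLA-based layout above: each SVE BFMMLA instruction treats its two
source vectors as row-major 2x4 BF16 matrices per 128-bit segment and accumulates
a (2x4) x (4x2) product into a 2x2 FP32 tile, which is why the kernel interleaves
pairs of input rows with trn1/trn2 before the multiply block and de-interleaves
the accumulators with uzp1/uzp2 before writeback. A minimal scalar model of one
such segment, assuming plain row-major arrays (illustrative only, not library
code):

#include <cstdint>
#include <cstring>

// bf16 is the high half of an fp32, so widening is a 16-bit shift.
static float bf16_to_f32(uint16_t h)
{
    uint32_t bits = uint32_t(h) << 16;
    float    f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// acc (2x2, fp32) += a (2x4, bf16) * transpose(b), with b stored as 2x4.
static void bfmmla_segment(float acc[2][2], const uint16_t a[2][4], const uint16_t b[2][4])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            for (int k = 0; k < 4; k++) {
                acc[i][j] += bf16_to_f32(a[i][k]) * bf16_to_f32(b[j][k]);
            }
        }
    }
}

Because row 0 and row 1 results land in the same destination register, a height-5
tile (as in the block above) simply pairs its odd row with whatever is in the
spare register and discards the unused half via uzp1 at writeback.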
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index f98ccdc7d3..6db9c0cdf3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -25,6 +25,7 @@
#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -38,11 +39,13 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_fp16_mla_6x4VL( ARGLIST );
+void sve_hybrid_fp16_mla_6x4VL_a64fx( ARGLIST );
class cls_sve_hybrid_fp16_mla_6x4VL
{
public:
- typedef __fp16 operand_type;
+ typedef __fp16 lhs_operand_type;
+ typedef __fp16 rhs_operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,16 +71,41 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.90 };
+ case CPUModel::A510:
+ return { 12.44 };
+ case CPUModel::V1:
+ return { 31.51 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp16_mla_6x4VL;
- cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *)
+ cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_fp16_mla_6x4VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
-#endif // ARM_COMPUTE_ENABLE_SVE
+
+#endif // ARM_COMPUTE_ENABLE_SVE
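
The get_performance_parameters() hook added above exposes a per-CPU-model
multiply-accumulate throughput estimate (15.90 by default, 12.44 on A510, 31.51
on V1) that the improved selection heuristics can use to rank candidate kernels,
while the constructor swaps in the A64FX-tuned kernel body when that model is
detected. A rough sketch of how a single throughput figure can rank candidates
(hypothetical helper, not the library's actual selection code):

#include <cstddef>

// Estimate the cycle cost of an MxNxK GEMM from a kernel's MAC throughput;
// packing costs and fixed overheads are deliberately ignored in this sketch.
static double estimated_cycles(size_t M, size_t N, size_t K, double macs_per_cycle)
{
    return (double(M) * double(N) * double(K)) / macs_per_cycle;
}

A caller comparing two kernels on the same problem would then prefer the one
with the lower estimate, e.g. estimated_cycles(M, N, K, 31.51) versus
estimated_cycles(M, N, K, 15.90) on a Neoverse V1.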
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..11f5ed2c0a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp
@@ -0,0 +1,1366 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const __fp16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;  // bit 2: output is addressed through an indirect pointer array
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;  // bit 3: input is indirect (tested with tbz %x[flags], #3 in the kernel)
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;  // bit 0: accumulate into the existing output (tbz %x[flags], #0)
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;  // bit 1: apply the min/max activation clamp (tbz %x[flags], #1)
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p0.h, x19, x10\n"
+ "cbz x11, 3f\n"
+ "ld1h { z8.h }, p4/Z, [x11]\n"
+ "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1h { z8.h }, p3/Z, [x28]\n"
+ "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "add x25, x25, #0x2\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "addvl x9, x9, #4\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 11f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p4/Z, [x19]\n"
+ "fmin z8.h, p4/M, z8.h, z1.h\n"
+ "fmin z9.h, p4/M, z9.h, z1.h\n"
+ "fmin z10.h, p4/M, z10.h, z1.h\n"
+ "fmin z11.h, p4/M, z11.h, z1.h\n"
+ "fmax z8.h, p4/M, z8.h, z0.h\n"
+ "fmax z9.h, p4/M, z9.h, z0.h\n"
+ "fmax z10.h, p4/M, z10.h, z0.h\n"
+ "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "11:" // Height 1: No activation
+ "st1h { z8.h }, p3, [x28]\n"
+ "st1h { z9.h }, p2, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "12:" // Height 1: Writeback done
+ "dech x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 74f\n"
+ "13:" // Height 2
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "14:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p0.h, x19, x10\n"
+ "cbz x11, 15f\n"
+ "ld1h { z8.h }, p4/Z, [x11]\n"
+ "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "b 17f\n"
+ "15:" // Height 2: no bias
+ "tbz %x[flags], #0, 16f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x28]\n"
+ "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "20:" // Height 2: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x2\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "subs x26, x26, #0x1\n"
+ "add x24, x24, #0x2\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "addvl x9, x9, #4\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "bne 18b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "tbz %x[flags], #1, 23f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p4/Z, [x19]\n"
+ "fmin z8.h, p4/M, z8.h, z1.h\n"
+ "fmin z9.h, p4/M, z9.h, z1.h\n"
+ "fmin z10.h, p4/M, z10.h, z1.h\n"
+ "fmin z11.h, p4/M, z11.h, z1.h\n"
+ "fmin z12.h, p4/M, z12.h, z1.h\n"
+ "fmin z13.h, p4/M, z13.h, z1.h\n"
+ "fmin z14.h, p4/M, z14.h, z1.h\n"
+ "fmin z15.h, p4/M, z15.h, z1.h\n"
+ "fmax z8.h, p4/M, z8.h, z0.h\n"
+ "fmax z9.h, p4/M, z9.h, z0.h\n"
+ "fmax z10.h, p4/M, z10.h, z0.h\n"
+ "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "fmax z12.h, p4/M, z12.h, z0.h\n"
+ "fmax z13.h, p4/M, z13.h, z0.h\n"
+ "fmax z14.h, p4/M, z14.h, z0.h\n"
+ "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "23:" // Height 2: No activation
+ "st1h { z8.h }, p3, [x28]\n"
+ "st1h { z9.h }, p2, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p3, [x24]\n"
+ "st1h { z13.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x24, #3, MUL VL]\n"
+ "24:" // Height 2: Writeback done
+ "dech x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 14b\n"
+ "b 74f\n"
+ "25:" // Height 3
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "26:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p0.h, x19, x10\n"
+ "cbz x11, 27f\n"
+ "ld1h { z8.h }, p4/Z, [x11]\n"
+ "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 29f\n"
+ "27:" // Height 3: no bias
+ "tbz %x[flags], #0, 28f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x28]\n"
+ "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x23]\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "32:" // Height 3: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "ld1rh { z2.h }, p4/Z, [x23]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x25, x25, #0x2\n"
+ "subs x26, x26, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x2\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "add x23, x23, #0x2\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "ld1rh { z2.h }, p4/Z, [x23]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x27, x27, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "cmp x27, x19\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "bne 30b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "tbz %x[flags], #1, 35f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p4/Z, [x19]\n"
+ "fmin z8.h, p4/M, z8.h, z1.h\n"
+ "fmin z9.h, p4/M, z9.h, z1.h\n"
+ "fmin z10.h, p4/M, z10.h, z1.h\n"
+ "fmin z11.h, p4/M, z11.h, z1.h\n"
+ "fmin z12.h, p4/M, z12.h, z1.h\n"
+ "fmin z13.h, p4/M, z13.h, z1.h\n"
+ "fmin z14.h, p4/M, z14.h, z1.h\n"
+ "fmin z15.h, p4/M, z15.h, z1.h\n"
+ "fmin z16.h, p4/M, z16.h, z1.h\n"
+ "fmin z17.h, p4/M, z17.h, z1.h\n"
+ "fmin z18.h, p4/M, z18.h, z1.h\n"
+ "fmin z19.h, p4/M, z19.h, z1.h\n"
+ "fmax z8.h, p4/M, z8.h, z0.h\n"
+ "fmax z9.h, p4/M, z9.h, z0.h\n"
+ "fmax z10.h, p4/M, z10.h, z0.h\n"
+ "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "fmax z12.h, p4/M, z12.h, z0.h\n"
+ "fmax z13.h, p4/M, z13.h, z0.h\n"
+ "fmax z14.h, p4/M, z14.h, z0.h\n"
+ "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "fmax z16.h, p4/M, z16.h, z0.h\n"
+ "fmax z17.h, p4/M, z17.h, z0.h\n"
+ "fmax z18.h, p4/M, z18.h, z0.h\n"
+ "fmax z19.h, p4/M, z19.h, z0.h\n"
+ "35:" // Height 3: No activation
+ "st1h { z8.h }, p3, [x28]\n"
+ "st1h { z9.h }, p2, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p3, [x24]\n"
+ "st1h { z13.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x23]\n"
+ "st1h { z17.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x23, #3, MUL VL]\n"
+ "36:" // Height 3: Writeback done
+ "dech x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 26b\n"
+ "b 74f\n"
+ "37:" // Height 4
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "38:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p0.h, x19, x10\n"
+ "cbz x11, 39f\n"
+ "ld1h { z8.h }, p4/Z, [x11]\n"
+ "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 41f\n"
+ "39:" // Height 4: no bias
+ "tbz %x[flags], #0, 40f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x28]\n"
+ "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x23]\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x22]\n"
+ "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 44f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "44:" // Height 4: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "ld1rh { z2.h }, p4/Z, [x23]\n"
+ "ld1rh { z3.h }, p4/Z, [x22]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x25, x25, #0x2\n"
+ "subs x26, x26, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x2\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "add x23, x23, #0x2\n"
+ "add x22, x22, #0x2\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "ld1rh { z2.h }, p4/Z, [x23]\n"
+ "ld1rh { z3.h }, p4/Z, [x22]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x27, x27, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "cmp x27, x19\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "bne 42b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "tbz %x[flags], #1, 47f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p4/Z, [x19]\n"
+ "fmin z8.h, p4/M, z8.h, z1.h\n"
+ "fmin z9.h, p4/M, z9.h, z1.h\n"
+ "fmin z10.h, p4/M, z10.h, z1.h\n"
+ "fmin z11.h, p4/M, z11.h, z1.h\n"
+ "fmin z12.h, p4/M, z12.h, z1.h\n"
+ "fmin z13.h, p4/M, z13.h, z1.h\n"
+ "fmin z14.h, p4/M, z14.h, z1.h\n"
+ "fmin z15.h, p4/M, z15.h, z1.h\n"
+ "fmin z16.h, p4/M, z16.h, z1.h\n"
+ "fmin z17.h, p4/M, z17.h, z1.h\n"
+ "fmin z18.h, p4/M, z18.h, z1.h\n"
+ "fmin z19.h, p4/M, z19.h, z1.h\n"
+ "fmin z20.h, p4/M, z20.h, z1.h\n"
+ "fmin z21.h, p4/M, z21.h, z1.h\n"
+ "fmin z22.h, p4/M, z22.h, z1.h\n"
+ "fmin z23.h, p4/M, z23.h, z1.h\n"
+ "fmax z8.h, p4/M, z8.h, z0.h\n"
+ "fmax z9.h, p4/M, z9.h, z0.h\n"
+ "fmax z10.h, p4/M, z10.h, z0.h\n"
+ "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "fmax z12.h, p4/M, z12.h, z0.h\n"
+ "fmax z13.h, p4/M, z13.h, z0.h\n"
+ "fmax z14.h, p4/M, z14.h, z0.h\n"
+ "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "fmax z16.h, p4/M, z16.h, z0.h\n"
+ "fmax z17.h, p4/M, z17.h, z0.h\n"
+ "fmax z18.h, p4/M, z18.h, z0.h\n"
+ "fmax z19.h, p4/M, z19.h, z0.h\n"
+ "fmax z20.h, p4/M, z20.h, z0.h\n"
+ "fmax z21.h, p4/M, z21.h, z0.h\n"
+ "fmax z22.h, p4/M, z22.h, z0.h\n"
+ "fmax z23.h, p4/M, z23.h, z0.h\n"
+ "47:" // Height 4: No activation
+ "st1h { z8.h }, p3, [x28]\n"
+ "st1h { z9.h }, p2, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p3, [x24]\n"
+ "st1h { z13.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x23]\n"
+ "st1h { z17.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x22]\n"
+ "st1h { z21.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x22, #3, MUL VL]\n"
+ "48:" // Height 4: Writeback done
+ "dech x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 38b\n"
+ "b 74f\n"
+ "49:" // Height 5
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "50:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p0.h, x19, x10\n"
+ "cbz x11, 51f\n"
+ "ld1h { z8.h }, p4/Z, [x11]\n"
+ "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 53f\n"
+ "51:" // Height 5: no bias
+ "tbz %x[flags], #0, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x28]\n"
+ "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x23]\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x22]\n"
+ "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x21]\n"
+ "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "56:" // Height 5: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "ld1rh { z2.h }, p4/Z, [x23]\n"
+ "ld1rh { z3.h }, p4/Z, [x22]\n"
+ "ld1rh { z4.h }, p4/Z, [x21]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x25, x25, #0x2\n"
+ "subs x26, x26, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "add x24, x24, #0x2\n"
+ "add x23, x23, #0x2\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x22, x22, #0x2\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "add x21, x21, #0x2\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z26.h, p4/M, z6.h, z4.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "ld1rh { z2.h }, p4/Z, [x23]\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "ld1rh { z3.h }, p4/Z, [x22]\n"
+ "ld1rh { z4.h }, p4/Z, [x21]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x27, x27, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x27, x19\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z26.h, p4/M, z6.h, z4.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "bne 54b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "tbz %x[flags], #1, 59f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p4/Z, [x19]\n"
+ "fmin z8.h, p4/M, z8.h, z1.h\n"
+ "fmin z9.h, p4/M, z9.h, z1.h\n"
+ "fmin z10.h, p4/M, z10.h, z1.h\n"
+ "fmin z11.h, p4/M, z11.h, z1.h\n"
+ "fmin z12.h, p4/M, z12.h, z1.h\n"
+ "fmin z13.h, p4/M, z13.h, z1.h\n"
+ "fmin z14.h, p4/M, z14.h, z1.h\n"
+ "fmin z15.h, p4/M, z15.h, z1.h\n"
+ "fmin z16.h, p4/M, z16.h, z1.h\n"
+ "fmin z17.h, p4/M, z17.h, z1.h\n"
+ "fmin z18.h, p4/M, z18.h, z1.h\n"
+ "fmin z19.h, p4/M, z19.h, z1.h\n"
+ "fmin z20.h, p4/M, z20.h, z1.h\n"
+ "fmin z21.h, p4/M, z21.h, z1.h\n"
+ "fmin z22.h, p4/M, z22.h, z1.h\n"
+ "fmin z23.h, p4/M, z23.h, z1.h\n"
+ "fmin z24.h, p4/M, z24.h, z1.h\n"
+ "fmin z25.h, p4/M, z25.h, z1.h\n"
+ "fmin z26.h, p4/M, z26.h, z1.h\n"
+ "fmin z27.h, p4/M, z27.h, z1.h\n"
+ "fmax z8.h, p4/M, z8.h, z0.h\n"
+ "fmax z9.h, p4/M, z9.h, z0.h\n"
+ "fmax z10.h, p4/M, z10.h, z0.h\n"
+ "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "fmax z12.h, p4/M, z12.h, z0.h\n"
+ "fmax z13.h, p4/M, z13.h, z0.h\n"
+ "fmax z14.h, p4/M, z14.h, z0.h\n"
+ "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "fmax z16.h, p4/M, z16.h, z0.h\n"
+ "fmax z17.h, p4/M, z17.h, z0.h\n"
+ "fmax z18.h, p4/M, z18.h, z0.h\n"
+ "fmax z19.h, p4/M, z19.h, z0.h\n"
+ "fmax z20.h, p4/M, z20.h, z0.h\n"
+ "fmax z21.h, p4/M, z21.h, z0.h\n"
+ "fmax z22.h, p4/M, z22.h, z0.h\n"
+ "fmax z23.h, p4/M, z23.h, z0.h\n"
+ "fmax z24.h, p4/M, z24.h, z0.h\n"
+ "fmax z25.h, p4/M, z25.h, z0.h\n"
+ "fmax z26.h, p4/M, z26.h, z0.h\n"
+ "fmax z27.h, p4/M, z27.h, z0.h\n"
+ "59:" // Height 5: No activation
+ "st1h { z8.h }, p3, [x28]\n"
+ "st1h { z9.h }, p2, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p3, [x24]\n"
+ "st1h { z13.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x23]\n"
+ "st1h { z17.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x22]\n"
+ "st1h { z21.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x22, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x21]\n"
+ "st1h { z25.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x21, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x21, #3, MUL VL]\n"
+ "60:" // Height 5: Writeback done
+ "dech x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 50b\n"
+ "b 74f\n"
+ "61:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0xc\n"
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "62:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x10\n"
+ "inch x19\n"
+ "whilelt p0.h, x19, x10\n"
+ "cbz x11, 63f\n"
+ "ld1h { z8.h }, p4/Z, [x11]\n"
+ "ld1h { z9.h }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1h { z10.h }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z11.h }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 65f\n"
+ "63:" // Height 6: no bias
+ "tbz %x[flags], #0, 64f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "ld1h { z8.h }, p3/Z, [x28]\n"
+ "ld1h { z9.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #1\n"
+ "ld1h { z10.h }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z11.h }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1h { z12.h }, p3/Z, [x24]\n"
+ "ld1h { z13.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1h { z14.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1h { z15.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x23]\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z18.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z20.h }, p3/Z, [x22]\n"
+ "ld1h { z21.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z22.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z23.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1h { z24.h }, p3/Z, [x21]\n"
+ "ld1h { z25.h }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z26.h }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z27.h }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1h { z28.h }, p3/Z, [x20]\n"
+ "ld1h { z29.h }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z30.h }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z31.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 68f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "68:" // Height 6: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "ld1rh { z2.h }, p4/Z, [x23]\n"
+ "ld1rh { z3.h }, p4/Z, [x22]\n"
+ "ld1rh { z4.h }, p4/Z, [x21]\n"
+ "ld1rh { z5.h }, p4/Z, [x20]\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x25, x25, #0x2\n"
+ "subs x26, x26, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "add x24, x24, #0x2\n"
+ "add x23, x23, #0x2\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z28.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x22, x22, #0x2\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "add x21, x21, #0x2\n"
+ "add x20, x20, #0x2\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "fmla z29.h, p4/M, z7.h, z5.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z26.h, p4/M, z6.h, z4.h\n"
+ "fmla z30.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x9]\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "ld1rh { z0.h }, p4/Z, [x25]\n"
+ "ld1rh { z1.h }, p4/Z, [x24]\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "ld1rh { z2.h }, p4/Z, [x23]\n"
+ "ld1rh { z3.h }, p4/Z, [x22]\n"
+ "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z31.h, p4/M, z7.h, z5.h\n"
+ "ld1rh { z4.h }, p4/Z, [x21]\n"
+ "ld1rh { z5.h }, p4/Z, [x20]\n"
+ "ld1h { z7.h }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.h, p4/M, z6.h, z0.h\n"
+ "fmla z12.h, p4/M, z6.h, z1.h\n"
+ "add x27, x27, #0x1\n"
+ "fmla z16.h, p4/M, z6.h, z2.h\n"
+ "fmla z20.h, p4/M, z6.h, z3.h\n"
+ "cmp x27, x19\n"
+ "fmla z24.h, p4/M, z6.h, z4.h\n"
+ "fmla z28.h, p4/M, z6.h, z5.h\n"
+ "ld1h { z6.h }, p4/Z, [x9, #2, MUL VL]\n"
+ "fmla z9.h, p4/M, z7.h, z0.h\n"
+ "fmla z13.h, p4/M, z7.h, z1.h\n"
+ "fmla z17.h, p4/M, z7.h, z2.h\n"
+ "fmla z21.h, p4/M, z7.h, z3.h\n"
+ "fmla z25.h, p4/M, z7.h, z4.h\n"
+ "fmla z29.h, p4/M, z7.h, z5.h\n"
+ "ld1h { z7.h }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.h, p4/M, z6.h, z0.h\n"
+ "fmla z14.h, p4/M, z6.h, z1.h\n"
+ "fmla z18.h, p4/M, z6.h, z2.h\n"
+ "fmla z22.h, p4/M, z6.h, z3.h\n"
+ "fmla z26.h, p4/M, z6.h, z4.h\n"
+ "fmla z30.h, p4/M, z6.h, z5.h\n"
+ "fmla z11.h, p4/M, z7.h, z0.h\n"
+ "fmla z15.h, p4/M, z7.h, z1.h\n"
+ "fmla z19.h, p4/M, z7.h, z2.h\n"
+ "fmla z23.h, p4/M, z7.h, z3.h\n"
+ "fmla z27.h, p4/M, z7.h, z4.h\n"
+ "fmla z31.h, p4/M, z7.h, z5.h\n"
+ "bne 66b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #1\n"
+ "add x23, x24, x19, LSL #1\n"
+ "add x22, x23, x19, LSL #1\n"
+ "add x21, x22, x19, LSL #1\n"
+ "add x20, x21, x19, LSL #1\n"
+ "tbz %x[flags], #1, 71f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z1.h }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z0.h }, p4/Z, [x19]\n"
+ "fmin z8.h, p4/M, z8.h, z1.h\n"
+ "fmin z9.h, p4/M, z9.h, z1.h\n"
+ "fmin z10.h, p4/M, z10.h, z1.h\n"
+ "fmin z11.h, p4/M, z11.h, z1.h\n"
+ "fmin z12.h, p4/M, z12.h, z1.h\n"
+ "fmin z13.h, p4/M, z13.h, z1.h\n"
+ "fmin z14.h, p4/M, z14.h, z1.h\n"
+ "fmin z15.h, p4/M, z15.h, z1.h\n"
+ "fmin z16.h, p4/M, z16.h, z1.h\n"
+ "fmin z17.h, p4/M, z17.h, z1.h\n"
+ "fmin z18.h, p4/M, z18.h, z1.h\n"
+ "fmin z19.h, p4/M, z19.h, z1.h\n"
+ "fmin z20.h, p4/M, z20.h, z1.h\n"
+ "fmin z21.h, p4/M, z21.h, z1.h\n"
+ "fmin z22.h, p4/M, z22.h, z1.h\n"
+ "fmin z23.h, p4/M, z23.h, z1.h\n"
+ "fmin z24.h, p4/M, z24.h, z1.h\n"
+ "fmin z25.h, p4/M, z25.h, z1.h\n"
+ "fmin z26.h, p4/M, z26.h, z1.h\n"
+ "fmin z27.h, p4/M, z27.h, z1.h\n"
+ "fmin z28.h, p4/M, z28.h, z1.h\n"
+ "fmin z29.h, p4/M, z29.h, z1.h\n"
+ "fmin z30.h, p4/M, z30.h, z1.h\n"
+ "fmin z31.h, p4/M, z31.h, z1.h\n"
+ "fmax z8.h, p4/M, z8.h, z0.h\n"
+ "fmax z9.h, p4/M, z9.h, z0.h\n"
+ "fmax z10.h, p4/M, z10.h, z0.h\n"
+ "fmax z11.h, p4/M, z11.h, z0.h\n"
+ "fmax z12.h, p4/M, z12.h, z0.h\n"
+ "fmax z13.h, p4/M, z13.h, z0.h\n"
+ "fmax z14.h, p4/M, z14.h, z0.h\n"
+ "fmax z15.h, p4/M, z15.h, z0.h\n"
+ "fmax z16.h, p4/M, z16.h, z0.h\n"
+ "fmax z17.h, p4/M, z17.h, z0.h\n"
+ "fmax z18.h, p4/M, z18.h, z0.h\n"
+ "fmax z19.h, p4/M, z19.h, z0.h\n"
+ "fmax z20.h, p4/M, z20.h, z0.h\n"
+ "fmax z21.h, p4/M, z21.h, z0.h\n"
+ "fmax z22.h, p4/M, z22.h, z0.h\n"
+ "fmax z23.h, p4/M, z23.h, z0.h\n"
+ "fmax z24.h, p4/M, z24.h, z0.h\n"
+ "fmax z25.h, p4/M, z25.h, z0.h\n"
+ "fmax z26.h, p4/M, z26.h, z0.h\n"
+ "fmax z27.h, p4/M, z27.h, z0.h\n"
+ "fmax z28.h, p4/M, z28.h, z0.h\n"
+ "fmax z29.h, p4/M, z29.h, z0.h\n"
+ "fmax z30.h, p4/M, z30.h, z0.h\n"
+ "fmax z31.h, p4/M, z31.h, z0.h\n"
+ "71:" // Height 6: No activation
+ "st1h { z8.h }, p3, [x28]\n"
+ "st1h { z9.h }, p2, [x28, #1, MUL VL]\n"
+ "st1h { z10.h }, p1, [x28, #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1h { z12.h }, p3, [x24]\n"
+ "st1h { z13.h }, p2, [x24, #1, MUL VL]\n"
+ "st1h { z14.h }, p1, [x24, #2, MUL VL]\n"
+ "st1h { z15.h }, p0, [x24, #3, MUL VL]\n"
+ "st1h { z16.h }, p3, [x23]\n"
+ "st1h { z17.h }, p2, [x23, #1, MUL VL]\n"
+ "st1h { z18.h }, p1, [x23, #2, MUL VL]\n"
+ "st1h { z19.h }, p0, [x23, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x22]\n"
+ "st1h { z21.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p1, [x22, #2, MUL VL]\n"
+ "st1h { z23.h }, p0, [x22, #3, MUL VL]\n"
+ "st1h { z24.h }, p3, [x21]\n"
+ "st1h { z25.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z26.h }, p1, [x21, #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [x21, #3, MUL VL]\n"
+ "st1h { z28.h }, p3, [x20]\n"
+ "st1h { z29.h }, p2, [x20, #1, MUL VL]\n"
+ "st1h { z30.h }, p1, [x20, #2, MUL VL]\n"
+ "st1h { z31.h }, p0, [x20, #3, MUL VL]\n"
+ "72:" // Height 6: Writeback done
+ "dech x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 62b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "73:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "74:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
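Annotation on the kernel above: each main-loop trip broadcasts one A element per row (ld1rh) and accumulates it against four B vectors, giving the 6x4 grid of accumulators z8..z31 seen in the Height-6 block. The following is a minimal C++ reference sketch of that loop nest, not library code; it uses float and a row-major B for readability (the real kernel works on __fp16 with pre-arranged B), and all names are illustrative.

#include <cstddef>

// Reference model of the hybrid kernel's loop nest: 6-row blocks of A,
// 4-vector-wide blocks of C, one K element consumed per main-loop trip.
void hybrid_mla_reference(const float *A, const float *B, float *C,
                          std::size_t M, std::size_t N, std::size_t K,
                          std::size_t VL /* lanes per SVE vector */) {
    for (std::size_t m = 0; m < M; m += 6) {            // "Height" blocks
        for (std::size_t n = 0; n < N; n += 4 * VL) {   // column loop
            for (std::size_t k = 0; k < K; ++k) {       // main multiply loop
                for (std::size_t r = 0; r < 6 && m + r < M; ++r) {
                    float a = A[(m + r) * K + k];       // the ld1rh broadcast
                    for (std::size_t c = 0; c < 4 * VL && n + c < N; ++c)
                        C[(m + r) * N + n + c] += a * B[k * N + n + c]; // fmla
                }
            }
        }
    }
}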
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
index c151179a1f..09d5d8d96d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -161,13 +161,12 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "cmp x26, #0x8\n"
+ "add x25, x25, #0x10\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z11.h, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.h, z6.h, z0.h[1]\n"
@@ -235,7 +234,6 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -318,9 +316,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z10.h, z6.h, z0.h[7]\n"
"fmla z11.h, z7.h, z0.h[7]\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
@@ -424,16 +421,14 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
"fmla z13.h, z7.h, z1.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #4, MUL VL]\n"
@@ -533,9 +528,7 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
- "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.h, z7.h, z1.h[0]\n"
@@ -650,10 +643,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z11.h, z7.h, z0.h[7]\n"
"fmla z15.h, z7.h, z1.h[7]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 19b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -787,21 +778,18 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
"add x23, x23, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x8\n"
- "fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z17.h, z7.h, z2.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
@@ -931,12 +919,9 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
- "add x23, x23, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
"fmla z16.h, z6.h, z2.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
@@ -1082,11 +1067,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z15.h, z7.h, z1.h[7]\n"
"fmla z19.h, z7.h, z2.h[7]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 32b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1249,26 +1231,22 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
+ "add x22, x22, #0x10\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
"fmla z20.h, z6.h, z3.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
@@ -1428,19 +1406,15 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
- "add x22, x22, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
"fmla z13.h, z7.h, z1.h[0]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
"fmla z20.h, z6.h, z3.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -1613,12 +1587,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z19.h, z7.h, z2.h[7]\n"
"fmla z23.h, z7.h, z3.h[7]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 45b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1810,32 +1780,27 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
"add x21, x21, #0x10\n"
"fmla z20.h, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
"fmla z24.h, z6.h, z4.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.h, z7.h, z4.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
"fmla z22.h, z6.h, z3.h[0]\n"
@@ -2024,22 +1989,17 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
- "add x21, x21, #0x10\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
"fmla z20.h, z6.h, z3.h[0]\n"
"fmla z24.h, z6.h, z4.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
"fmla z25.h, z7.h, z4.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
@@ -2243,13 +2203,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z23.h, z7.h, z3.h[7]\n"
"fmla z27.h, z7.h, z4.h[7]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 58b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -2473,37 +2428,31 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x8\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z12.h, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z16.h, z6.h, z2.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z20.h, z6.h, z3.h[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
"add x20, x20, #0x10\n"
"fmla z24.h, z6.h, z4.h[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x8\n"
"fmla z28.h, z6.h, z5.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.h, z7.h, z4.h[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"fmla z29.h, z7.h, z5.h[0]\n"
"ld1h { z7.h }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.h, z6.h, z0.h[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla z14.h, z6.h, z1.h[0]\n"
"fmla z18.h, z6.h, z2.h[0]\n"
"fmla z22.h, z6.h, z3.h[0]\n"
@@ -2722,25 +2671,19 @@ void sve_hybrid_fp16_mla_6x4VL (
"ld1rqh { z0.h }, p0/Z, [x25]\n"
"fmla z8.h, z6.h, z0.h[0]\n"
"ld1rqh { z1.h }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.h, z7.h, z0.h[0]\n"
"ld1rqh { z2.h }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.h, z6.h, z1.h[0]\n"
"ld1rqh { z3.h }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.h, z6.h, z2.h[0]\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
"ld1rqh { z4.h }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"fmla z13.h, z7.h, z1.h[0]\n"
"ld1rqh { z5.h }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
"fmla z20.h, z6.h, z3.h[0]\n"
- "add x20, x20, #0x10\n"
- "fmla z17.h, z7.h, z2.h[0]\n"
"fmla z24.h, z6.h, z4.h[0]\n"
"fmla z28.h, z6.h, z5.h[0]\n"
"ld1h { z6.h }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
"fmla z21.h, z7.h, z3.h[0]\n"
"fmla z25.h, z7.h, z4.h[0]\n"
"fmla z29.h, z7.h, z5.h[0]\n"
@@ -2975,14 +2918,8 @@ void sve_hybrid_fp16_mla_6x4VL (
"fmla z27.h, z7.h, z4.h[7]\n"
"fmla z31.h, z7.h, z5.h[7]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 71b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
index 4c0a3a11e0..1c140e0c02 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -22,9 +22,10 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -38,11 +39,13 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_fp32_mla_6x4VL( ARGLIST );
+void sve_hybrid_fp32_mla_6x4VL_a64fx( ARGLIST );
class cls_sve_hybrid_fp32_mla_6x4VL
{
public:
- typedef float operand_type;
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,16 +71,37 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 1> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 6.667 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp32_mla_6x4VL;
- cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *)
+ cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_fp32_mla_6x4VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
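Annotation on the header diff above: both kernel variants share one signature (ARGLIST), so CPU-model selection reduces to a single function-pointer assignment made once, at construction time. A minimal sketch of that pattern follows; it is an assumption-labelled illustration, not the library's code, and the boolean parameter stands in for the CPUModel::A64FX check.

#include <cstdio>

using kern_type = void (*)(int n);

void generic_kernel(int n) { std::printf("generic x%d\n", n); }
void a64fx_kernel(int n)   { std::printf("a64fx x%d\n", n); }

struct cls_example {
    kern_type kernel = generic_kernel;          // default, as in the header
    explicit cls_example(bool cpu_is_a64fx) {   // stands in for the CPU-model switch
        if (cpu_is_a64fx) kernel = a64fx_kernel;
    }
};

int main() {
    cls_example k(/*cpu_is_a64fx=*/true);
    k.kernel(4); // dispatch through the stored pointer, as arm_gemm does
}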
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..30b6a54277
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/a64fx.cpp
@@ -0,0 +1,1366 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
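+ // Flag-bit summary for the asm below (annotation): bit 0 = accumulate
+ // into the existing C values, bit 1 = apply the min/max activation
+ // clamp, bit 2 = indirect output rows, bit 3 = indirect input rows.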
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "cbz x11, 3f\n"
+ "ld1w { z8.s }, p4/Z, [x11]\n"
+ "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "add x25, x25, #0x4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "subs x26, x26, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "addvl x9, x9, #4\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 11f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p4/Z, [x19]\n"
+ "fmin z8.s, p4/M, z8.s, z1.s\n"
+ "fmin z9.s, p4/M, z9.s, z1.s\n"
+ "fmin z10.s, p4/M, z10.s, z1.s\n"
+ "fmin z11.s, p4/M, z11.s, z1.s\n"
+ "fmax z8.s, p4/M, z8.s, z0.s\n"
+ "fmax z9.s, p4/M, z9.s, z0.s\n"
+ "fmax z10.s, p4/M, z10.s, z0.s\n"
+ "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "11:" // Height 1: No activation
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "12:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 74f\n"
+ "13:" // Height 2
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "14:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "cbz x11, 15f\n"
+ "ld1w { z8.s }, p4/Z, [x11]\n"
+ "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "b 17f\n"
+ "15:" // Height 2: no bias
+ "tbz %x[flags], #0, 16f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "20:" // Height 2: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "subs x26, x26, #0x1\n"
+ "add x24, x24, #0x4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "addvl x9, x9, #4\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "bne 18b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "tbz %x[flags], #1, 23f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p4/Z, [x19]\n"
+ "fmin z8.s, p4/M, z8.s, z1.s\n"
+ "fmin z9.s, p4/M, z9.s, z1.s\n"
+ "fmin z10.s, p4/M, z10.s, z1.s\n"
+ "fmin z11.s, p4/M, z11.s, z1.s\n"
+ "fmin z12.s, p4/M, z12.s, z1.s\n"
+ "fmin z13.s, p4/M, z13.s, z1.s\n"
+ "fmin z14.s, p4/M, z14.s, z1.s\n"
+ "fmin z15.s, p4/M, z15.s, z1.s\n"
+ "fmax z8.s, p4/M, z8.s, z0.s\n"
+ "fmax z9.s, p4/M, z9.s, z0.s\n"
+ "fmax z10.s, p4/M, z10.s, z0.s\n"
+ "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "fmax z12.s, p4/M, z12.s, z0.s\n"
+ "fmax z13.s, p4/M, z13.s, z0.s\n"
+ "fmax z14.s, p4/M, z14.s, z0.s\n"
+ "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "23:" // Height 2: No activation
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x24]\n"
+ "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "24:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 14b\n"
+ "b 74f\n"
+ "25:" // Height 3
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "26:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "cbz x11, 27f\n"
+ "ld1w { z8.s }, p4/Z, [x11]\n"
+ "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 29f\n"
+ "27:" // Height 3: no bias
+ "tbz %x[flags], #0, 28f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "32:" // Height 3: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x4\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x27, x27, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "cmp x27, x19\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "bne 30b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "tbz %x[flags], #1, 35f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p4/Z, [x19]\n"
+ "fmin z8.s, p4/M, z8.s, z1.s\n"
+ "fmin z9.s, p4/M, z9.s, z1.s\n"
+ "fmin z10.s, p4/M, z10.s, z1.s\n"
+ "fmin z11.s, p4/M, z11.s, z1.s\n"
+ "fmin z12.s, p4/M, z12.s, z1.s\n"
+ "fmin z13.s, p4/M, z13.s, z1.s\n"
+ "fmin z14.s, p4/M, z14.s, z1.s\n"
+ "fmin z15.s, p4/M, z15.s, z1.s\n"
+ "fmin z16.s, p4/M, z16.s, z1.s\n"
+ "fmin z17.s, p4/M, z17.s, z1.s\n"
+ "fmin z18.s, p4/M, z18.s, z1.s\n"
+ "fmin z19.s, p4/M, z19.s, z1.s\n"
+ "fmax z8.s, p4/M, z8.s, z0.s\n"
+ "fmax z9.s, p4/M, z9.s, z0.s\n"
+ "fmax z10.s, p4/M, z10.s, z0.s\n"
+ "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "fmax z12.s, p4/M, z12.s, z0.s\n"
+ "fmax z13.s, p4/M, z13.s, z0.s\n"
+ "fmax z14.s, p4/M, z14.s, z0.s\n"
+ "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "fmax z16.s, p4/M, z16.s, z0.s\n"
+ "fmax z17.s, p4/M, z17.s, z0.s\n"
+ "fmax z18.s, p4/M, z18.s, z0.s\n"
+ "fmax z19.s, p4/M, z19.s, z0.s\n"
+ "35:" // Height 3: No activation
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x24]\n"
+ "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x23]\n"
+ "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+ "36:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 26b\n"
+ "b 74f\n"
+ "37:" // Height 4
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "38:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "cbz x11, 39f\n"
+ "ld1w { z8.s }, p4/Z, [x11]\n"
+ "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 41f\n"
+ "39:" // Height 4: no bias
+ "tbz %x[flags], #0, 40f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 44f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "44:" // Height 4: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x4\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "add x23, x23, #0x4\n"
+ "add x22, x22, #0x4\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x27, x27, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "cmp x27, x19\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "bne 42b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "tbz %x[flags], #1, 47f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p4/Z, [x19]\n"
+ "fmin z8.s, p4/M, z8.s, z1.s\n"
+ "fmin z9.s, p4/M, z9.s, z1.s\n"
+ "fmin z10.s, p4/M, z10.s, z1.s\n"
+ "fmin z11.s, p4/M, z11.s, z1.s\n"
+ "fmin z12.s, p4/M, z12.s, z1.s\n"
+ "fmin z13.s, p4/M, z13.s, z1.s\n"
+ "fmin z14.s, p4/M, z14.s, z1.s\n"
+ "fmin z15.s, p4/M, z15.s, z1.s\n"
+ "fmin z16.s, p4/M, z16.s, z1.s\n"
+ "fmin z17.s, p4/M, z17.s, z1.s\n"
+ "fmin z18.s, p4/M, z18.s, z1.s\n"
+ "fmin z19.s, p4/M, z19.s, z1.s\n"
+ "fmin z20.s, p4/M, z20.s, z1.s\n"
+ "fmin z21.s, p4/M, z21.s, z1.s\n"
+ "fmin z22.s, p4/M, z22.s, z1.s\n"
+ "fmin z23.s, p4/M, z23.s, z1.s\n"
+ "fmax z8.s, p4/M, z8.s, z0.s\n"
+ "fmax z9.s, p4/M, z9.s, z0.s\n"
+ "fmax z10.s, p4/M, z10.s, z0.s\n"
+ "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "fmax z12.s, p4/M, z12.s, z0.s\n"
+ "fmax z13.s, p4/M, z13.s, z0.s\n"
+ "fmax z14.s, p4/M, z14.s, z0.s\n"
+ "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "fmax z16.s, p4/M, z16.s, z0.s\n"
+ "fmax z17.s, p4/M, z17.s, z0.s\n"
+ "fmax z18.s, p4/M, z18.s, z0.s\n"
+ "fmax z19.s, p4/M, z19.s, z0.s\n"
+ "fmax z20.s, p4/M, z20.s, z0.s\n"
+ "fmax z21.s, p4/M, z21.s, z0.s\n"
+ "fmax z22.s, p4/M, z22.s, z0.s\n"
+ "fmax z23.s, p4/M, z23.s, z0.s\n"
+ "47:" // Height 4: No activation
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x24]\n"
+ "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x23]\n"
+ "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x22]\n"
+ "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+ "48:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 38b\n"
+ "b 74f\n"
+ "49:" // Height 5
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "50:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "cbz x11, 51f\n"
+ "ld1w { z8.s }, p4/Z, [x11]\n"
+ "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 53f\n"
+ "51:" // Height 5: no bias
+ "tbz %x[flags], #0, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "56:" // Height 5: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x22, x22, #0x4\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "add x21, x21, #0x4\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z26.s, p4/M, z6.s, z4.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x27, x27, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x27, x19\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z26.s, p4/M, z6.s, z4.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "bne 54b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "tbz %x[flags], #1, 59f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p4/Z, [x19]\n"
+ "fmin z8.s, p4/M, z8.s, z1.s\n"
+ "fmin z9.s, p4/M, z9.s, z1.s\n"
+ "fmin z10.s, p4/M, z10.s, z1.s\n"
+ "fmin z11.s, p4/M, z11.s, z1.s\n"
+ "fmin z12.s, p4/M, z12.s, z1.s\n"
+ "fmin z13.s, p4/M, z13.s, z1.s\n"
+ "fmin z14.s, p4/M, z14.s, z1.s\n"
+ "fmin z15.s, p4/M, z15.s, z1.s\n"
+ "fmin z16.s, p4/M, z16.s, z1.s\n"
+ "fmin z17.s, p4/M, z17.s, z1.s\n"
+ "fmin z18.s, p4/M, z18.s, z1.s\n"
+ "fmin z19.s, p4/M, z19.s, z1.s\n"
+ "fmin z20.s, p4/M, z20.s, z1.s\n"
+ "fmin z21.s, p4/M, z21.s, z1.s\n"
+ "fmin z22.s, p4/M, z22.s, z1.s\n"
+ "fmin z23.s, p4/M, z23.s, z1.s\n"
+ "fmin z24.s, p4/M, z24.s, z1.s\n"
+ "fmin z25.s, p4/M, z25.s, z1.s\n"
+ "fmin z26.s, p4/M, z26.s, z1.s\n"
+ "fmin z27.s, p4/M, z27.s, z1.s\n"
+ "fmax z8.s, p4/M, z8.s, z0.s\n"
+ "fmax z9.s, p4/M, z9.s, z0.s\n"
+ "fmax z10.s, p4/M, z10.s, z0.s\n"
+ "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "fmax z12.s, p4/M, z12.s, z0.s\n"
+ "fmax z13.s, p4/M, z13.s, z0.s\n"
+ "fmax z14.s, p4/M, z14.s, z0.s\n"
+ "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "fmax z16.s, p4/M, z16.s, z0.s\n"
+ "fmax z17.s, p4/M, z17.s, z0.s\n"
+ "fmax z18.s, p4/M, z18.s, z0.s\n"
+ "fmax z19.s, p4/M, z19.s, z0.s\n"
+ "fmax z20.s, p4/M, z20.s, z0.s\n"
+ "fmax z21.s, p4/M, z21.s, z0.s\n"
+ "fmax z22.s, p4/M, z22.s, z0.s\n"
+ "fmax z23.s, p4/M, z23.s, z0.s\n"
+ "fmax z24.s, p4/M, z24.s, z0.s\n"
+ "fmax z25.s, p4/M, z25.s, z0.s\n"
+ "fmax z26.s, p4/M, z26.s, z0.s\n"
+ "fmax z27.s, p4/M, z27.s, z0.s\n"
+ "59:" // Height 5: No activation
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x24]\n"
+ "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x23]\n"
+ "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x22]\n"
+ "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x21]\n"
+ "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
+ "60:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 50b\n"
+ "b 74f\n"
+ "61:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "62:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "cbz x11, 63f\n"
+ "ld1w { z8.s }, p4/Z, [x11]\n"
+ "ld1w { z9.s }, p4/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x11, #2, MUL VL]\n"
+ "mov z12.d, z8.d\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p4/Z, [x11, #3, MUL VL]\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "addvl x11, x11, #4\n"
+ "mov z16.d, z8.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 65f\n"
+ "63:" // Height 6: no bias
+ "tbz %x[flags], #0, 64f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x24]\n"
+ "ld1w { z13.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "ld1w { z17.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x22]\n"
+ "ld1w { z21.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21]\n"
+ "ld1w { z25.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x20]\n"
+ "ld1w { z29.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 68f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "68:" // Height 6: input setup done
+ "subs x26, x26, #0x1\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z28.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x22, x22, #0x4\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "add x21, x21, #0x4\n"
+ "add x20, x20, #0x4\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "fmla z29.s, p4/M, z7.s, z5.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z26.s, p4/M, z6.s, z4.s\n"
+ "fmla z30.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x9]\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z31.s, p4/M, z7.s, z5.s\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ "ld1w { z7.s }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "fmla z8.s, p4/M, z6.s, z0.s\n"
+ "fmla z12.s, p4/M, z6.s, z1.s\n"
+ "add x27, x27, #0x1\n"
+ "fmla z16.s, p4/M, z6.s, z2.s\n"
+ "fmla z20.s, p4/M, z6.s, z3.s\n"
+ "cmp x27, x19\n"
+ "fmla z24.s, p4/M, z6.s, z4.s\n"
+ "fmla z28.s, p4/M, z6.s, z5.s\n"
+ "ld1w { z6.s }, p4/Z, [x9, #2, MUL VL]\n"
+ "fmla z9.s, p4/M, z7.s, z0.s\n"
+ "fmla z13.s, p4/M, z7.s, z1.s\n"
+ "fmla z17.s, p4/M, z7.s, z2.s\n"
+ "fmla z21.s, p4/M, z7.s, z3.s\n"
+ "fmla z25.s, p4/M, z7.s, z4.s\n"
+ "fmla z29.s, p4/M, z7.s, z5.s\n"
+ "ld1w { z7.s }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "fmla z10.s, p4/M, z6.s, z0.s\n"
+ "fmla z14.s, p4/M, z6.s, z1.s\n"
+ "fmla z18.s, p4/M, z6.s, z2.s\n"
+ "fmla z22.s, p4/M, z6.s, z3.s\n"
+ "fmla z26.s, p4/M, z6.s, z4.s\n"
+ "fmla z30.s, p4/M, z6.s, z5.s\n"
+ "fmla z11.s, p4/M, z7.s, z0.s\n"
+ "fmla z15.s, p4/M, z7.s, z1.s\n"
+ "fmla z19.s, p4/M, z7.s, z2.s\n"
+ "fmla z23.s, p4/M, z7.s, z3.s\n"
+ "fmla z27.s, p4/M, z7.s, z4.s\n"
+ "fmla z31.s, p4/M, z7.s, z5.s\n"
+ "bne 66b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "tbz %x[flags], #1, 71f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p4/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p4/Z, [x19]\n"
+ "fmin z8.s, p4/M, z8.s, z1.s\n"
+ "fmin z9.s, p4/M, z9.s, z1.s\n"
+ "fmin z10.s, p4/M, z10.s, z1.s\n"
+ "fmin z11.s, p4/M, z11.s, z1.s\n"
+ "fmin z12.s, p4/M, z12.s, z1.s\n"
+ "fmin z13.s, p4/M, z13.s, z1.s\n"
+ "fmin z14.s, p4/M, z14.s, z1.s\n"
+ "fmin z15.s, p4/M, z15.s, z1.s\n"
+ "fmin z16.s, p4/M, z16.s, z1.s\n"
+ "fmin z17.s, p4/M, z17.s, z1.s\n"
+ "fmin z18.s, p4/M, z18.s, z1.s\n"
+ "fmin z19.s, p4/M, z19.s, z1.s\n"
+ "fmin z20.s, p4/M, z20.s, z1.s\n"
+ "fmin z21.s, p4/M, z21.s, z1.s\n"
+ "fmin z22.s, p4/M, z22.s, z1.s\n"
+ "fmin z23.s, p4/M, z23.s, z1.s\n"
+ "fmin z24.s, p4/M, z24.s, z1.s\n"
+ "fmin z25.s, p4/M, z25.s, z1.s\n"
+ "fmin z26.s, p4/M, z26.s, z1.s\n"
+ "fmin z27.s, p4/M, z27.s, z1.s\n"
+ "fmin z28.s, p4/M, z28.s, z1.s\n"
+ "fmin z29.s, p4/M, z29.s, z1.s\n"
+ "fmin z30.s, p4/M, z30.s, z1.s\n"
+ "fmin z31.s, p4/M, z31.s, z1.s\n"
+ "fmax z8.s, p4/M, z8.s, z0.s\n"
+ "fmax z9.s, p4/M, z9.s, z0.s\n"
+ "fmax z10.s, p4/M, z10.s, z0.s\n"
+ "fmax z11.s, p4/M, z11.s, z0.s\n"
+ "fmax z12.s, p4/M, z12.s, z0.s\n"
+ "fmax z13.s, p4/M, z13.s, z0.s\n"
+ "fmax z14.s, p4/M, z14.s, z0.s\n"
+ "fmax z15.s, p4/M, z15.s, z0.s\n"
+ "fmax z16.s, p4/M, z16.s, z0.s\n"
+ "fmax z17.s, p4/M, z17.s, z0.s\n"
+ "fmax z18.s, p4/M, z18.s, z0.s\n"
+ "fmax z19.s, p4/M, z19.s, z0.s\n"
+ "fmax z20.s, p4/M, z20.s, z0.s\n"
+ "fmax z21.s, p4/M, z21.s, z0.s\n"
+ "fmax z22.s, p4/M, z22.s, z0.s\n"
+ "fmax z23.s, p4/M, z23.s, z0.s\n"
+ "fmax z24.s, p4/M, z24.s, z0.s\n"
+ "fmax z25.s, p4/M, z25.s, z0.s\n"
+ "fmax z26.s, p4/M, z26.s, z0.s\n"
+ "fmax z27.s, p4/M, z27.s, z0.s\n"
+ "fmax z28.s, p4/M, z28.s, z0.s\n"
+ "fmax z29.s, p4/M, z29.s, z0.s\n"
+ "fmax z30.s, p4/M, z30.s, z0.s\n"
+ "fmax z31.s, p4/M, z31.s, z0.s\n"
+ "71:" // Height 6: No activation
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x24]\n"
+ "st1w { z13.s }, p2, [x24, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x23]\n"
+ "st1w { z17.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x22]\n"
+ "st1w { z21.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x21]\n"
+ "st1w { z25.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x20]\n"
+ "st1w { z29.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x20, #3, MUL VL]\n"
+ "72:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 62b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "73:" // Update direct input
+ "mov x19, #0x18\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "74:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
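
The `tbz %x[flags], #0/#1/#3` tests that gate the accumulate, clamp and indirect-input paths in the kernel above all read a single flags word assembled by the C++ wrapper. A minimal sketch of that bit layout, mirroring the wrapper code that appears in full for the 8x1VL A64FX kernel later in this patch (names are taken from that wrapper; this is an illustration, not a separate API):

    // Bit layout of 'flags' as consumed by the inline assembly:
    unsigned long flags = 0;
    if (accumulate)                          flags |= 0x1;  // bit 0: add into existing C
    if (act.type != Activation::Type::None)  flags |= 0x2;  // bit 1: apply min/max clamp
    if (output_arg.is_indirect)              flags |= 0x4;  // bit 2: output rows via pointer array
    if (A_arg.is_indirect)                   flags |= 0x8;  // bit 3: input rows via pointer array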
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
index 25d65826b9..3baf7b9715 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -161,13 +161,12 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x4\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "cmp x26, #0x4\n"
+ "add x25, x25, #0x10\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z11.s, z7.s, z0.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #5, MUL VL]\n"
"fmla z8.s, z6.s, z0.s[1]\n"
@@ -202,7 +201,6 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -241,9 +239,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z10.s, z6.s, z0.s[3]\n"
"fmla z11.s, z7.s, z0.s[3]\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
@@ -347,16 +344,14 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x4\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x4\n"
"fmla z13.s, z7.s, z1.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #4, MUL VL]\n"
@@ -407,9 +402,7 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
- "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
"fmla z13.s, z7.s, z1.s[0]\n"
@@ -464,10 +457,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z11.s, z7.s, z0.s[3]\n"
"fmla z15.s, z7.s, z1.s[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 19b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -601,21 +592,18 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x4\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
"add x23, x23, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "cmp x26, #0x4\n"
- "fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z17.s, z7.s, z2.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
@@ -680,12 +668,9 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
- "add x23, x23, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
"fmla z16.s, z6.s, z2.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
@@ -755,11 +740,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z15.s, z7.s, z1.s[3]\n"
"fmla z19.s, z7.s, z2.s[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 32b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -922,26 +904,22 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x4\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x4\n"
+ "add x22, x22, #0x10\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
"fmla z20.s, z6.s, z3.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
@@ -1020,19 +998,15 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "add x22, x22, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
"fmla z13.s, z7.s, z1.s[0]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
"fmla z20.s, z6.s, z3.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
"addvl x10, x10, #4\n"
@@ -1113,12 +1087,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z19.s, z7.s, z2.s[3]\n"
"fmla z23.s, z7.s, z3.s[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 45b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1310,32 +1280,27 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x4\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
"add x21, x21, #0x10\n"
"fmla z20.s, z6.s, z3.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x4\n"
"fmla z24.s, z6.s, z4.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.s, z7.s, z4.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
"fmla z22.s, z6.s, z3.s[0]\n"
@@ -1427,22 +1392,17 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
- "add x21, x21, #0x10\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
"fmla z20.s, z6.s, z3.s[0]\n"
"fmla z24.s, z6.s, z4.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
"fmla z25.s, z7.s, z4.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
@@ -1538,13 +1498,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z23.s, z7.s, z3.s[3]\n"
"fmla z27.s, z7.s, z4.s[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 58b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1768,37 +1723,31 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x4\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z12.s, z6.s, z1.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z16.s, z6.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z20.s, z6.s, z3.s[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
"add x20, x20, #0x10\n"
"fmla z24.s, z6.s, z4.s[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x4\n"
"fmla z28.s, z6.s, z5.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.s, z7.s, z4.s[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"fmla z29.s, z7.s, z5.s[0]\n"
"ld1w { z7.s }, p5/Z, [x10, #3, MUL VL]\n"
"fmla z10.s, z6.s, z0.s[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla z14.s, z6.s, z1.s[0]\n"
"fmla z18.s, z6.s, z2.s[0]\n"
"fmla z22.s, z6.s, z3.s[0]\n"
@@ -1904,25 +1853,19 @@ void sve_hybrid_fp32_mla_6x4VL (
"ld1rqw { z0.s }, p0/Z, [x25]\n"
"fmla z8.s, z6.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"fmla z9.s, z7.s, z0.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z12.s, z6.s, z1.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"fmla z13.s, z7.s, z1.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
"fmla z20.s, z6.s, z3.s[0]\n"
- "add x20, x20, #0x10\n"
- "fmla z17.s, z7.s, z2.s[0]\n"
"fmla z24.s, z6.s, z4.s[0]\n"
"fmla z28.s, z6.s, z5.s[0]\n"
"ld1w { z6.s }, p5/Z, [x10, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
"fmla z21.s, z7.s, z3.s[0]\n"
"fmla z25.s, z7.s, z4.s[0]\n"
"fmla z29.s, z7.s, z5.s[0]\n"
@@ -2033,14 +1976,8 @@ void sve_hybrid_fp32_mla_6x4VL (
"fmla z27.s, z7.s, z4.s[3]\n"
"fmla z31.s, z7.s, z5.s[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 71b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index 87f063d224..c0718b1e75 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -22,8 +22,8 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
#define ARGLIST \
@@ -38,11 +38,13 @@ namespace arm_gemm
{
// Actual kernel implementations
void sve_hybrid_fp32_mla_8x1VL( ARGLIST );
+void sve_hybrid_fp32_mla_8x1VL_a64fx( ARGLIST );
class cls_sve_hybrid_fp32_mla_8x1VL
{
public:
- typedef float operand_type;
+ typedef float lhs_operand_type;
+ typedef float rhs_operand_type;
typedef float result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,16 +70,24 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 8, 1, 1> transforms = {};
// Default to the generic kernel
kern_type kernel=sve_hybrid_fp32_mla_8x1VL;
- cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *)
+ cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *ci)
{
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_hybrid_fp32_mla_8x1VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
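
The constructor change above is the dispatch point for the new variant: the class keeps the generic `sve_hybrid_fp32_mla_8x1VL` as its default `kernel` and swaps in `sve_hybrid_fp32_mla_8x1VL_a64fx` when the detected CPU model matches, while the parallel `lhs_operand_type`/`rhs_operand_type` split allows kernels whose left and right operands use different element types. A hypothetical caller, sketched for illustration (the arm_gemm framework normally performs this construction internally):

    const CPUInfo *ci = /* obtained from the framework */;
    cls_sve_hybrid_fp32_mla_8x1VL strat(ci);
    // strat.kernel == sve_hybrid_fp32_mla_8x1VL_a64fx on CPUModel::A64FX,
    // and the generic sve_hybrid_fp32_mla_8x1VL on every other core.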
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
new file mode 100644
index 0000000000..0a37f8abfc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/a64fx.cpp
@@ -0,0 +1,1143 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_8x1VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x8\n"
+ "bge 85f\n"
+ "cmp %x[M], #0x6\n"
+ "bgt 73f\n"
+ "beq 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "mov x13, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p0.s, x19, x12\n"
+ "cbz x13, 3f\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z24.s }, p0/Z, [x10]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z24.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x9, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x19, x9, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "cbnz x9, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "subs x28, x28, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop
+ "ld1w { z8.s }, p1/Z, [x11]\n"
+ "add x27, x27, #0x4\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "addvl x11, x11, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z9.s }, p1/Z, [x11]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "addvl x11, x11, #1\n"
+ "bne 6b\n"
+ "tbz %x[flags], #1, 11f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x19]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "11:" // Height 1: No activation
+ "st1w { z24.s }, p0, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "12:" // Height 1: Writeback done
+ "decw x12\n"
+ "cmp x12, XZR\n"
+ "bgt 2b\n"
+ "b 98f\n"
+ "13:" // Height 2
+ "mov x13, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "14:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p0.s, x19, x12\n"
+ "cbz x13, 15f\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x13, x13, #1\n"
+ "b 17f\n"
+ "15:" // Height 2: no bias
+ "tbz %x[flags], #0, 16f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x26]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x9, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x19, x9, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "cbnz x9, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "20:" // Height 2: input setup done
+ "subs x28, x28, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop
+ "ld1w { z8.s }, p1/Z, [x11]\n"
+ "add x27, x27, #0x4\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "add x26, x26, #0x4\n"
+ "fmla z25.s, p1/M, z8.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z9.s }, p1/Z, [x11]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "bne 18b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "tbz %x[flags], #1, 23f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x19]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "23:" // Height 2: No activation
+ "st1w { z24.s }, p0, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "24:" // Height 2: Writeback done
+ "decw x12\n"
+ "cmp x12, XZR\n"
+ "bgt 14b\n"
+ "b 98f\n"
+ "25:" // Height 3
+ "mov x13, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "26:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p0.s, x19, x12\n"
+ "cbz x13, 27f\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x13, x13, #1\n"
+ "b 29f\n"
+ "27:" // Height 3: no bias
+ "tbz %x[flags], #0, 28f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x26]\n"
+ "ld1w { z26.s }, p0/Z, [x25]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x9, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x19, x9, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "cbnz x9, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "32:" // Height 3: input setup done
+ "subs x28, x28, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop
+ "ld1w { z8.s }, p1/Z, [x11]\n"
+ "add x27, x27, #0x4\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "add x26, x26, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "fmla z25.s, p1/M, z8.s, z1.s\n"
+ "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "addvl x11, x11, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z9.s }, p1/Z, [x11]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "fmla z26.s, p1/M, z9.s, z2.s\n"
+ "bne 30b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "tbz %x[flags], #1, 35f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x19]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "35:" // Height 3: No activation
+ "st1w { z24.s }, p0, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x25]\n"
+ "36:" // Height 3: Writeback done
+ "decw x12\n"
+ "cmp x12, XZR\n"
+ "bgt 26b\n"
+ "b 98f\n"
+ "37:" // Height 4
+ "mov x13, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "38:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p0.s, x19, x12\n"
+ "cbz x13, 39f\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x13, x13, #1\n"
+ "mov z27.d, z24.d\n"
+ "b 41f\n"
+ "39:" // Height 4: no bias
+ "tbz %x[flags], #0, 40f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x26]\n"
+ "ld1w { z26.s }, p0/Z, [x25]\n"
+ "ld1w { z27.s }, p0/Z, [x24]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x9, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x19, x9, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x9, 44f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "44:" // Height 4: input setup done
+ "subs x28, x28, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop
+ "ld1w { z8.s }, p1/Z, [x11]\n"
+ "add x27, x27, #0x4\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "add x26, x26, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "fmla z25.s, p1/M, z8.s, z1.s\n"
+ "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "add x24, x24, #0x4\n"
+ "fmla z27.s, p1/M, z8.s, z3.s\n"
+ "addvl x11, x11, #1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z9.s }, p1/Z, [x11]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "fmla z26.s, p1/M, z9.s, z2.s\n"
+ "fmla z27.s, p1/M, z9.s, z3.s\n"
+ "bne 42b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "tbz %x[flags], #1, 47f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x19]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "47:" // Height 4: No activation
+ "st1w { z24.s }, p0, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x25]\n"
+ "st1w { z27.s }, p0, [x24]\n"
+ "48:" // Height 4: Writeback done
+ "decw x12\n"
+ "cmp x12, XZR\n"
+ "bgt 38b\n"
+ "b 98f\n"
+ "49:" // Height 5
+ "mov x13, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "50:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p0.s, x19, x12\n"
+ "cbz x13, 51f\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x13, x13, #1\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "b 53f\n"
+ "51:" // Height 5: no bias
+ "tbz %x[flags], #0, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x26]\n"
+ "ld1w { z26.s }, p0/Z, [x25]\n"
+ "ld1w { z27.s }, p0/Z, [x24]\n"
+ "ld1w { z28.s }, p0/Z, [x23]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x9, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x19, x9, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "cbnz x9, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "56:" // Height 5: input setup done
+ "subs x28, x28, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ld1rw { z4.s }, p1/Z, [x23]\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop
+ "ld1w { z8.s }, p1/Z, [x11]\n"
+ "add x27, x27, #0x4\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "add x26, x26, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "fmla z25.s, p1/M, z8.s, z1.s\n"
+ "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z27.s, p1/M, z8.s, z3.s\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z28.s, p1/M, z8.s, z4.s\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ld1rw { z4.s }, p1/Z, [x23]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z9.s }, p1/Z, [x11]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "fmla z26.s, p1/M, z9.s, z2.s\n"
+ "fmla z27.s, p1/M, z9.s, z3.s\n"
+ "fmla z28.s, p1/M, z9.s, z4.s\n"
+ "bne 54b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "tbz %x[flags], #1, 59f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x19]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmin z28.s, p1/M, z28.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "fmax z28.s, p1/M, z28.s, z16.s\n"
+ "59:" // Height 5: No activation
+ "st1w { z24.s }, p0, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x25]\n"
+ "st1w { z27.s }, p0, [x24]\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "60:" // Height 5: Writeback done
+ "decw x12\n"
+ "cmp x12, XZR\n"
+ "bgt 50b\n"
+ "b 98f\n"
+ "61:" // Height 6
+ "mov x13, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "62:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p0.s, x19, x12\n"
+ "cbz x13, 63f\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x13, x13, #1\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "b 65f\n"
+ "63:" // Height 6: no bias
+ "tbz %x[flags], #0, 64f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x26]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x25]\n"
+ "ld1w { z27.s }, p0/Z, [x24]\n"
+ "ld1w { z28.s }, p0/Z, [x23]\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x9, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x19, x9, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "cbnz x9, 68f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "68:" // Height 6: input setup done
+ "subs x28, x28, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ld1rw { z4.s }, p1/Z, [x23]\n"
+ "ld1rw { z5.s }, p1/Z, [x22]\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop
+ "ld1w { z8.s }, p1/Z, [x11]\n"
+ "add x27, x27, #0x4\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "add x26, x26, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "fmla z25.s, p1/M, z8.s, z1.s\n"
+ "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z27.s, p1/M, z8.s, z3.s\n"
+ "fmla z28.s, p1/M, z8.s, z4.s\n"
+ "add x22, x22, #0x4\n"
+ "addvl x11, x11, #1\n"
+ "fmla z29.s, p1/M, z8.s, z5.s\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ld1rw { z4.s }, p1/Z, [x23]\n"
+ "ld1rw { z5.s }, p1/Z, [x22]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z9.s }, p1/Z, [x11]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "fmla z26.s, p1/M, z9.s, z2.s\n"
+ "fmla z27.s, p1/M, z9.s, z3.s\n"
+ "fmla z28.s, p1/M, z9.s, z4.s\n"
+ "fmla z29.s, p1/M, z9.s, z5.s\n"
+ "bne 66b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "tbz %x[flags], #1, 71f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x19]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmin z28.s, p1/M, z28.s, z17.s\n"
+ "fmin z29.s, p1/M, z29.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "fmax z28.s, p1/M, z28.s, z16.s\n"
+ "fmax z29.s, p1/M, z29.s, z16.s\n"
+ "71:" // Height 6: No activation
+ "st1w { z24.s }, p0, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x25]\n"
+ "st1w { z27.s }, p0, [x24]\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "72:" // Height 6: Writeback done
+ "decw x12\n"
+ "cmp x12, XZR\n"
+ "bgt 62b\n"
+ "b 98f\n"
+ "73:" // Height 7
+ "mov x13, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "74:" // Height 7: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p0.s, x19, x12\n"
+ "cbz x13, 75f\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x13, x13, #1\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "mov z30.d, z24.d\n"
+ "b 77f\n"
+ "75:" // Height 7: no bias
+ "tbz %x[flags], #0, 76f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x26]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x25]\n"
+ "ld1w { z27.s }, p0/Z, [x24]\n"
+ "ld1w { z28.s }, p0/Z, [x23]\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ "b 77f\n"
+ "76:" // Height 7: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "77:" // Height 7: setup done
+ "mov x9, #0x0\n"
+ "78:" // Height 7: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x19, x9, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "ldr x21, [x20, #0x30]\n"
+ "cbnz x9, 80f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 80f\n"
+ "79:" // Height 7: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "80:" // Height 7: input setup done
+ "subs x28, x28, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ld1rw { z4.s }, p1/Z, [x23]\n"
+ "ld1rw { z5.s }, p1/Z, [x22]\n"
+ "ld1rw { z6.s }, p1/Z, [x21]\n"
+ "ble 82f\n"
+ "81:" // Height 7: Multiply loop: Main loop
+ "ld1w { z8.s }, p1/Z, [x11]\n"
+ "add x27, x27, #0x4\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "add x26, x26, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "fmla z25.s, p1/M, z8.s, z1.s\n"
+ "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z27.s, p1/M, z8.s, z3.s\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "add x22, x22, #0x4\n"
+ "add x21, x21, #0x4\n"
+ "fmla z28.s, p1/M, z8.s, z4.s\n"
+ "fmla z29.s, p1/M, z8.s, z5.s\n"
+ "addvl x11, x11, #1\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "fmla z30.s, p1/M, z8.s, z6.s\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ld1rw { z4.s }, p1/Z, [x23]\n"
+ "ld1rw { z5.s }, p1/Z, [x22]\n"
+ "ld1rw { z6.s }, p1/Z, [x21]\n"
+ "bgt 81b\n"
+ "82:" // Height 7: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z9.s }, p1/Z, [x11]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "fmla z26.s, p1/M, z9.s, z2.s\n"
+ "fmla z27.s, p1/M, z9.s, z3.s\n"
+ "fmla z28.s, p1/M, z9.s, z4.s\n"
+ "fmla z29.s, p1/M, z9.s, z5.s\n"
+ "fmla z30.s, p1/M, z9.s, z6.s\n"
+ "bne 78b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x19]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmin z28.s, p1/M, z28.s, z17.s\n"
+ "fmin z29.s, p1/M, z29.s, z17.s\n"
+ "fmin z30.s, p1/M, z30.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "fmax z28.s, p1/M, z28.s, z16.s\n"
+ "fmax z29.s, p1/M, z29.s, z16.s\n"
+ "fmax z30.s, p1/M, z30.s, z16.s\n"
+ "83:" // Height 7: No activation
+ "st1w { z24.s }, p0, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x25]\n"
+ "st1w { z27.s }, p0, [x24]\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "st1w { z30.s }, p0, [x21]\n"
+ "84:" // Height 7: Writeback done
+ "decw x12\n"
+ "cmp x12, XZR\n"
+ "bgt 74b\n"
+ "b 98f\n"
+ "85:" // Height 8
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x20\n"
+ "mov x13, %x[bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "86:" // Height 8: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p0.s, x19, x12\n"
+ "cbz x13, 87f\n"
+ "ld1w { z24.s }, p1/Z, [x13]\n"
+ "mov z25.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "addvl x13, x13, #1\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "mov z30.d, z24.d\n"
+ "mov z31.d, z24.d\n"
+ "b 89f\n"
+ "87:" // Height 8: no bias
+ "tbz %x[flags], #0, 88f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z24.s }, p0/Z, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x26]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x25]\n"
+ "ld1w { z27.s }, p0/Z, [x24]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z28.s }, p0/Z, [x23]\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "b 89f\n"
+ "88:" // Height 8: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "89:" // Height 8: setup done
+ "mov x9, #0x0\n"
+ "90:" // Height 8: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w28, [x19, x9, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 91f\n"
+ "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x27, [x20, #0x0]\n"
+ "ldr x26, [x20, #0x8]\n"
+ "ldr x25, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x23, [x20, #0x20]\n"
+ "ldr x22, [x20, #0x28]\n"
+ "ldr x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x38]\n"
+ "cbnz x9, 92f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 92f\n"
+ "91:" // Height 8: setup direct input
+ "mov x27, %x[input_ptr]\n"
+ "add x26, x27, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "92:" // Height 8: input setup done
+ "subs x28, x28, #0x1\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ld1rw { z4.s }, p1/Z, [x23]\n"
+ "ld1rw { z5.s }, p1/Z, [x22]\n"
+ "ld1rw { z6.s }, p1/Z, [x21]\n"
+ "ld1rw { z7.s }, p1/Z, [x20]\n"
+ "ble 94f\n"
+ "93:" // Height 8: Multiply loop: Main loop
+ "ld1w { z8.s }, p1/Z, [x11]\n"
+ "add x27, x27, #0x4\n"
+ "subs x28, x28, #0x1\n"
+ "fmla z24.s, p1/M, z8.s, z0.s\n"
+ "add x26, x26, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "fmla z25.s, p1/M, z8.s, z1.s\n"
+ "fmla z26.s, p1/M, z8.s, z2.s\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "fmla z27.s, p1/M, z8.s, z3.s\n"
+ "fmla z28.s, p1/M, z8.s, z4.s\n"
+ "add x22, x22, #0x4\n"
+ "add x21, x21, #0x4\n"
+ "fmla z29.s, p1/M, z8.s, z5.s\n"
+ "ld1rw { z0.s }, p1/Z, [x27]\n"
+ "add x20, x20, #0x4\n"
+ "addvl x11, x11, #1\n"
+ "ld1rw { z1.s }, p1/Z, [x26]\n"
+ "fmla z30.s, p1/M, z8.s, z6.s\n"
+ "fmla z31.s, p1/M, z8.s, z7.s\n"
+ "ld1rw { z2.s }, p1/Z, [x25]\n"
+ "ld1rw { z3.s }, p1/Z, [x24]\n"
+ "ld1rw { z4.s }, p1/Z, [x23]\n"
+ "ld1rw { z5.s }, p1/Z, [x22]\n"
+ "ld1rw { z6.s }, p1/Z, [x21]\n"
+ "ld1rw { z7.s }, p1/Z, [x20]\n"
+ "bgt 93b\n"
+ "94:" // Height 8: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "ld1w { z9.s }, p1/Z, [x11]\n"
+ "add x9, x9, #0x1\n"
+ "cmp x9, x19\n"
+ "fmla z24.s, p1/M, z9.s, z0.s\n"
+ "fmla z25.s, p1/M, z9.s, z1.s\n"
+ "addvl x11, x11, #1\n"
+ "fmla z26.s, p1/M, z9.s, z2.s\n"
+ "fmla z27.s, p1/M, z9.s, z3.s\n"
+ "fmla z28.s, p1/M, z9.s, z4.s\n"
+ "fmla z29.s, p1/M, z9.s, z5.s\n"
+ "fmla z30.s, p1/M, z9.s, z6.s\n"
+ "fmla z31.s, p1/M, z9.s, z7.s\n"
+ "bne 90b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x26, x10, x19, LSL #2\n"
+ "add x25, x26, x19, LSL #2\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "tbz %x[flags], #1, 95f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z17.s }, p1/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z16.s }, p1/Z, [x19]\n"
+ "fmin z24.s, p1/M, z24.s, z17.s\n"
+ "fmin z25.s, p1/M, z25.s, z17.s\n"
+ "fmin z26.s, p1/M, z26.s, z17.s\n"
+ "fmin z27.s, p1/M, z27.s, z17.s\n"
+ "fmin z28.s, p1/M, z28.s, z17.s\n"
+ "fmin z29.s, p1/M, z29.s, z17.s\n"
+ "fmin z30.s, p1/M, z30.s, z17.s\n"
+ "fmin z31.s, p1/M, z31.s, z17.s\n"
+ "fmax z24.s, p1/M, z24.s, z16.s\n"
+ "fmax z25.s, p1/M, z25.s, z16.s\n"
+ "fmax z26.s, p1/M, z26.s, z16.s\n"
+ "fmax z27.s, p1/M, z27.s, z16.s\n"
+ "fmax z28.s, p1/M, z28.s, z16.s\n"
+ "fmax z29.s, p1/M, z29.s, z16.s\n"
+ "fmax z30.s, p1/M, z30.s, z16.s\n"
+ "fmax z31.s, p1/M, z31.s, z16.s\n"
+ "95:" // Height 8: No activation
+ "st1w { z24.s }, p0, [x10]\n"
+ "addvl x10, x10, #1\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ "st1w { z26.s }, p0, [x25]\n"
+ "st1w { z27.s }, p0, [x24]\n"
+ "st1w { z28.s }, p0, [x23]\n"
+ "st1w { z29.s }, p0, [x22]\n"
+ "st1w { z30.s }, p0, [x21]\n"
+ "st1w { z31.s }, p0, [x20]\n"
+ "96:" // Height 8: Writeback done
+ "decw x12\n"
+ "cmp x12, XZR\n"
+ "bgt 86b\n"
+ "subs %x[M], %x[M], #0x8\n"
+ "beq 98f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 97f\n"
+ "add x20, x20, #0x8\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "97:" // Update direct input
+ "mov x19, #0x20\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "98:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
index 943e0ac148..5b4b6b9b2e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -149,12 +149,11 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z24.s, z11.s, z0.s[3]\n"
"bgt 9b\n"
@@ -164,7 +163,6 @@ void sve_hybrid_fp32_mla_8x1VL (
"subs x28, x28, #0x1\n"
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
- "add x27, x27, #0x10\n"
"addvl x12, x12, #1\n"
"ble 11f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
@@ -181,9 +179,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z24.s, z11.s, z0.s[3]\n"
"addvl x12, x12, #1\n"
"11:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 6b\n"
"tbz %x[flags], #1, 12f\n"
@@ -254,18 +251,16 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x26, x26, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
"fmla z25.s, z11.s, z1.s[3]\n"
@@ -277,16 +272,14 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
- "fmla z25.s, z8.s, z1.s[0]\n"
- "add x26, x26, #0x10\n"
"addvl x12, x12, #1\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
"ble 24f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"subs x28, x28, #0x1\n"
- "fmla z25.s, z9.s, z1.s[1]\n"
"addvl x12, x12, #1\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
"ble 24f\n"
"ld1w { z10.s }, p2/Z, [x12]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
@@ -299,10 +292,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"addvl x12, x12, #1\n"
"fmla z25.s, z11.s, z1.s[3]\n"
"24:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 19b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -385,22 +376,19 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x25, x25, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z26.s, z9.s, z2.s[1]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
@@ -415,13 +403,10 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
- "add x25, x25, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 37f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -443,11 +428,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z25.s, z11.s, z1.s[3]\n"
"fmla z26.s, z11.s, z2.s[3]\n"
"37:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 32b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -541,27 +523,23 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z24.s, z11.s, z0.s[3]\n"
@@ -578,16 +556,12 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"fmla z27.s, z8.s, z3.s[0]\n"
- "add x24, x24, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 50f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -612,12 +586,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z26.s, z11.s, z2.s[3]\n"
"fmla z27.s, z11.s, z3.s[3]\n"
"50:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 45b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -722,33 +692,28 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x23, x23, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -766,19 +731,14 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
"fmla z28.s, z8.s, z4.s[0]\n"
- "add x23, x23, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 63f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -806,13 +766,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z27.s, z11.s, z3.s[3]\n"
"fmla z28.s, z11.s, z4.s[3]\n"
"63:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 58b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -928,38 +883,32 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x22, x22, #0x10\n"
"fmla z29.s, z8.s, z5.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -979,22 +928,16 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
"fmla z29.s, z8.s, z5.s[0]\n"
- "add x22, x22, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 76f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -1025,14 +968,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z28.s, z11.s, z4.s[3]\n"
"fmla z29.s, z11.s, z5.s[3]\n"
"76:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 71b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1159,43 +1096,36 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
"ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z29.s, z8.s, z5.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x21, x21, #0x10\n"
"fmla z30.s, z8.s, z6.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z30.s, z9.s, z6.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -1217,25 +1147,18 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
"ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
"fmla z30.s, z8.s, z6.s[0]\n"
- "add x21, x21, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 89f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -1269,15 +1192,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z29.s, z11.s, z5.s[3]\n"
"fmla z30.s, z11.s, z6.s[3]\n"
"89:" // Height 7: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 84b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1418,48 +1334,40 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "cmp x28, #0x4\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
+ "add x27, x27, #0x10\n"
"fmla z24.s, z9.s, z0.s[1]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "add x26, x26, #0x10\n"
"fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"fmla z25.s, z9.s, z1.s[1]\n"
"ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"fmla z28.s, z8.s, z4.s[0]\n"
"ld1rqw { z7.s }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"fmla z29.s, z8.s, z5.s[0]\n"
"ld1w { z10.s }, p2/Z, [x12, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
"fmla z30.s, z8.s, z6.s[0]\n"
"ld1w { z11.s }, p2/Z, [x12, #3, MUL VL]\n"
- "cmp x28, #0x4\n"
+ "add x20, x20, #0x10\n"
"fmla z31.s, z8.s, z7.s[0]\n"
- "prfm pldl1keep, [x27, #0x80]\n"
"addvl x12, x12, #4\n"
"fmla z26.s, z9.s, z2.s[1]\n"
- "prfm pldl1keep, [x26, #0x80]\n"
"fmla z27.s, z9.s, z3.s[1]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"fmla z28.s, z9.s, z4.s[1]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"fmla z29.s, z9.s, z5.s[1]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"fmla z30.s, z9.s, z6.s[1]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"fmla z31.s, z9.s, z7.s[1]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"fmla z24.s, z10.s, z0.s[2]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"fmla z25.s, z10.s, z1.s[2]\n"
"fmla z26.s, z10.s, z2.s[2]\n"
"fmla z27.s, z10.s, z3.s[2]\n"
@@ -1483,28 +1391,20 @@ void sve_hybrid_fp32_mla_8x1VL (
"ld1rqw { z0.s }, p0/Z, [x27]\n"
"fmla z24.s, z8.s, z0.s[0]\n"
"ld1rqw { z1.s }, p0/Z, [x26]\n"
- "add x27, x27, #0x10\n"
+ "addvl x12, x12, #1\n"
"fmla z25.s, z8.s, z1.s[0]\n"
"ld1rqw { z2.s }, p0/Z, [x25]\n"
- "add x26, x26, #0x10\n"
- "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z3.s }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
- "fmla z27.s, z8.s, z3.s[0]\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
"ld1rqw { z4.s }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "fmla z28.s, z8.s, z4.s[0]\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
"ld1rqw { z5.s }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "fmla z29.s, z8.s, z5.s[0]\n"
"ld1rqw { z6.s }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "fmla z30.s, z8.s, z6.s[0]\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
"ld1rqw { z7.s }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "fmla z30.s, z8.s, z6.s[0]\n"
"fmla z31.s, z8.s, z7.s[0]\n"
- "add x20, x20, #0x10\n"
- "addvl x12, x12, #1\n"
"ble 102f\n"
"ld1w { z9.s }, p2/Z, [x12]\n"
"fmla z24.s, z9.s, z0.s[1]\n"
@@ -1541,16 +1441,8 @@ void sve_hybrid_fp32_mla_8x1VL (
"fmla z30.s, z11.s, z6.s[3]\n"
"fmla z31.s, z11.s, z7.s[3]\n"
"102:" // Height 8: Multiply loop: multiply skip
- "prfm pldl1keep, [x27, #0x80]\n"
- "add x9, x9, #0x1\n"
- "prfm pldl1keep, [x26, #0x80]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x9, x9, #0x1\n"
"cmp x9, x19\n"
"bne 97b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
new file mode 100644
index 0000000000..2142f1067d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL.hpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
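+// ARGLIST mirrors the kernel entry point in generic.cpp: num_strings,
+// string_lengths, fp32 input descriptor, M, N, bf16 weight pointer,
+// fp32 output descriptor, bias, activation, accumulate flag.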
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_fp32bf16fp32_mmla_4x6VL( ARGLIST );
+
+class cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 6;
+ }
+
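+    // k_unroll of 4: K is consumed four fp32 elements at a time, matching the
+    // depth of one BFMMLA step after conversion to bf16.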
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 12, 4> transforms = {};
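+    // Per-model performance estimates consumed by the GEMM kernel-selection
+    // heuristics (a single estimated throughput figure; units are internal to arm_gemm).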
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 16.63 };
+ case CPUModel::A510:
+ return { 5.42 };
+ case CPUModel::V1:
+ return { 28.40 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp32bf16fp32_mmla_4x6VL;
+ cls_sve_hybrid_fp32bf16fp32_mmla_4x6VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
new file mode 100644
index 0000000000..43b0f54805
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_4x6VL/generic.cpp
@@ -0,0 +1,1306 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32bf16fp32_mmla_4x6VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
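+    // KernelArgs is read from the inline asm via offsetof(), so its fields
+    // must stay in sync with the %[offsetof_*] operands below.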
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
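+    // flags bit summary (tested with tbz in the asm): bit 0 = accumulate,
+    // bit 1 = activation clamp, bit 2 = indirect output, bit 3 = indirect input.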
+ __asm__ __volatile__(
+ "ptrue p7.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "mov x9, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p6.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p5.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p4.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x28\n"
+ "cbz x9, 3f\n"
+ "ld1w { z8.s }, p7/Z, [x9]\n"
+ "ld1w { z9.s }, p7/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p7/Z, [x9, #2, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p7/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p7/Z, [x9, #4, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z13.s }, p7/Z, [x9, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x9, x9, #6\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z9.s }, p6/Z, [x26]\n"
+ "ld1w { z10.s }, p5/Z, [x26, #1, MUL VL]\n"
+ "zip1 z8.d, z9.d, z14.d\n"
+ "ld1w { z11.s }, p4/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x26, #3, MUL VL]\n"
+ "zip2 z14.d, z9.d, z14.d\n"
+ "zip1 z9.d, z10.d, z15.d\n"
+ "ld1w { z13.s }, p2/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x26, #5, MUL VL]\n"
+ "zip2 z15.d, z10.d, z15.d\n"
+ "zip1 z10.d, z11.d, z16.d\n"
+ "zip2 z16.d, z11.d, z16.d\n"
+ "zip1 z11.d, z12.d, z17.d\n"
+ "zip2 z17.d, z12.d, z17.d\n"
+ "zip1 z12.d, z13.d, z18.d\n"
+ "zip2 z18.d, z13.d, z18.d\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w24, [x19, x25, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x24, #0x4\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x24\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z4.h }, p7/Z, [x27]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n"
+ "sub x24, x24, #0x4\n"
+ "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n"
+ "cmp x24, #0x4\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ "add x23, x23, #0x10\n"
+ "addvl x27, x27, #-4\n"
+ ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x24\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z4.h }, p7/Z, [x27]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n"
+ ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ "addvl x27, x27, #-4\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 6b\n"
+ "uzp1 z8.d, z8.d, z14.d\n"
+ "uzp1 z9.d, z9.d, z15.d\n"
+ "uzp1 z10.d, z10.d, z16.d\n"
+ "uzp1 z11.d, z11.d, z17.d\n"
+ "uzp1 z12.d, z12.d, z18.d\n"
+ "uzp1 z13.d, z13.d, z19.d\n"
+ "tbz %x[flags], #1, 12f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p7/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p7/Z, [x19]\n"
+ "fmin z8.s, p7/M, z8.s, z1.s\n"
+ "fmin z9.s, p7/M, z9.s, z1.s\n"
+ "fmin z10.s, p7/M, z10.s, z1.s\n"
+ "fmin z11.s, p7/M, z11.s, z1.s\n"
+ "fmin z12.s, p7/M, z12.s, z1.s\n"
+ "fmin z13.s, p7/M, z13.s, z1.s\n"
+ "fmax z8.s, p7/M, z8.s, z0.s\n"
+ "fmax z9.s, p7/M, z9.s, z0.s\n"
+ "fmax z10.s, p7/M, z10.s, z0.s\n"
+ "fmax z11.s, p7/M, z11.s, z0.s\n"
+ "fmax z12.s, p7/M, z12.s, z0.s\n"
+ "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p6, [x26]\n"
+ "st1w { z9.s }, p5, [x26, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x26, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x26, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x26, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x26, #5, MUL VL]\n"
+ "addvl x26, x26, #6\n"
+ "13:" // Height 1: Writeback done
+ "decw x28, ALL, MUL #6\n"
+ "cmp x28, XZR\n"
+ "bgt 2b\n"
+ "b 54f\n"
+ "14:" // Height 2
+ "mov x9, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p6.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p5.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p4.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x28\n"
+ "cbz x9, 16f\n"
+ "ld1w { z8.s }, p7/Z, [x9]\n"
+ "ld1w { z9.s }, p7/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p7/Z, [x9, #2, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p7/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p7/Z, [x9, #4, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z13.s }, p7/Z, [x9, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x9, x9, #6\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "ld1w { z9.s }, p6/Z, [x26]\n"
+ "ld1w { z10.s }, p5/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x22]\n"
+ "zip1 z8.d, z9.d, z14.d\n"
+ "zip2 z14.d, z9.d, z14.d\n"
+ "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "zip1 z9.d, z10.d, z15.d\n"
+ "zip2 z15.d, z10.d, z15.d\n"
+ "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "zip1 z10.d, z11.d, z16.d\n"
+ "zip2 z16.d, z11.d, z16.d\n"
+ "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "zip1 z11.d, z12.d, z17.d\n"
+ "zip2 z17.d, z12.d, z17.d\n"
+ "zip1 z12.d, z13.d, z18.d\n"
+ "zip2 z18.d, z13.d, z18.d\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "18:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w24, [x19, x25, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 21f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "21:" // Height 2: input setup done
+ "cmp x24, #0x4\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x24\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z4.h }, p7/Z, [x27]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n"
+ "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n"
+ "sub x24, x24, #0x4\n"
+ "cmp x24, #0x4\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ "addvl x27, x27, #-4\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x24\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z4.h }, p7/Z, [x27]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n"
+ "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ "addvl x27, x27, #-4\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 19b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "add x22, x26, x19, LSL #2\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p7/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p7/Z, [x19]\n"
+ "fmin z4.s, p7/M, z4.s, z1.s\n"
+ "fmin z14.s, p7/M, z14.s, z1.s\n"
+ "fmin z15.s, p7/M, z15.s, z1.s\n"
+ "fmin z16.s, p7/M, z16.s, z1.s\n"
+ "fmin z17.s, p7/M, z17.s, z1.s\n"
+ "fmin z18.s, p7/M, z18.s, z1.s\n"
+ "fmin z8.s, p7/M, z8.s, z1.s\n"
+ "fmin z9.s, p7/M, z9.s, z1.s\n"
+ "fmin z10.s, p7/M, z10.s, z1.s\n"
+ "fmin z11.s, p7/M, z11.s, z1.s\n"
+ "fmin z12.s, p7/M, z12.s, z1.s\n"
+ "fmin z13.s, p7/M, z13.s, z1.s\n"
+ "fmax z4.s, p7/M, z4.s, z0.s\n"
+ "fmax z14.s, p7/M, z14.s, z0.s\n"
+ "fmax z15.s, p7/M, z15.s, z0.s\n"
+ "fmax z16.s, p7/M, z16.s, z0.s\n"
+ "fmax z17.s, p7/M, z17.s, z0.s\n"
+ "fmax z18.s, p7/M, z18.s, z0.s\n"
+ "fmax z8.s, p7/M, z8.s, z0.s\n"
+ "fmax z9.s, p7/M, z9.s, z0.s\n"
+ "fmax z10.s, p7/M, z10.s, z0.s\n"
+ "fmax z11.s, p7/M, z11.s, z0.s\n"
+ "fmax z12.s, p7/M, z12.s, z0.s\n"
+ "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "25:" // Height 2: No activation
+ "st1w { z4.s }, p6, [x26]\n"
+ "st1w { z14.s }, p5, [x26, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x26, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x26, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x26, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x26, #5, MUL VL]\n"
+ "addvl x26, x26, #6\n"
+ "st1w { z8.s }, p6, [x22]\n"
+ "st1w { z9.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x22, #5, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x28, ALL, MUL #6\n"
+ "cmp x28, XZR\n"
+ "bgt 15b\n"
+ "b 54f\n"
+ "27:" // Height 3
+ "mov x9, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p6.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p5.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p4.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x28\n"
+ "cbz x9, 29f\n"
+ "ld1w { z8.s }, p7/Z, [x9]\n"
+ "ld1w { z9.s }, p7/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p7/Z, [x9, #2, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p7/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p7/Z, [x9, #4, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z13.s }, p7/Z, [x9, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x9, x9, #6\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z26.d, z14.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z27.d, z15.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z28.d, z16.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z29.d, z17.d\n"
+ "mov z24.d, z12.d\n"
+ "mov z30.d, z18.d\n"
+ "mov z25.d, z13.d\n"
+ "mov z31.d, z19.d\n"
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z9.s }, p6/Z, [x26]\n"
+ "ld1w { z10.s }, p5/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x22]\n"
+ "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+ "zip1 z8.d, z9.d, z14.d\n"
+ "zip2 z14.d, z9.d, z14.d\n"
+ "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z15.d\n"
+ "zip2 z15.d, z10.d, z15.d\n"
+ "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "zip1 z10.d, z11.d, z16.d\n"
+ "zip2 z16.d, z11.d, z16.d\n"
+ "ld1w { z21.s }, p6/Z, [x21]\n"
+ "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "zip1 z11.d, z12.d, z17.d\n"
+ "zip2 z17.d, z12.d, z17.d\n"
+ "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "zip1 z12.d, z13.d, z18.d\n"
+ "zip2 z18.d, z13.d, z18.d\n"
+ "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "ld1w { z4.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "zip1 z20.d, z21.d, z26.d\n"
+ "zip2 z26.d, z21.d, z26.d\n"
+ "zip1 z21.d, z22.d, z27.d\n"
+ "zip2 z27.d, z22.d, z27.d\n"
+ "zip1 z22.d, z23.d, z28.d\n"
+ "zip2 z28.d, z23.d, z28.d\n"
+ "zip1 z23.d, z24.d, z29.d\n"
+ "zip2 z29.d, z24.d, z29.d\n"
+ "zip1 z24.d, z25.d, z30.d\n"
+ "zip2 z30.d, z25.d, z30.d\n"
+ "zip1 z25.d, z4.d, z31.d\n"
+ "zip2 z31.d, z4.d, z31.d\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w24, [x19, x25, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 34f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "34:" // Height 3: input setup done
+ "cmp x24, #0x4\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x24\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z4.h }, p7/Z, [x27]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
+ "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n"
+ "sub x24, x24, #0x4\n"
+ ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n"
+ "cmp x24, #0x4\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
+ ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n"
+ ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ "addvl x27, x27, #-4\n"
+ ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
+ ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
+ ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x24\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z4.h }, p7/Z, [x27]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
+ "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n"
+ ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n"
+ ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
+ ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n"
+ ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n"
+ ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ "addvl x27, x27, #-4\n"
+ ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
+ ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
+ ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 32b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "uzp1 z20.d, z20.d, z26.d\n"
+ "uzp1 z21.d, z21.d, z27.d\n"
+ "uzp1 z22.d, z22.d, z28.d\n"
+ "uzp1 z23.d, z23.d, z29.d\n"
+ "uzp1 z24.d, z24.d, z30.d\n"
+ "uzp1 z25.d, z25.d, z31.d\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p7/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p7/Z, [x19]\n"
+ "fmin z4.s, p7/M, z4.s, z1.s\n"
+ "fmin z14.s, p7/M, z14.s, z1.s\n"
+ "fmin z15.s, p7/M, z15.s, z1.s\n"
+ "fmin z16.s, p7/M, z16.s, z1.s\n"
+ "fmin z17.s, p7/M, z17.s, z1.s\n"
+ "fmin z18.s, p7/M, z18.s, z1.s\n"
+ "fmin z8.s, p7/M, z8.s, z1.s\n"
+ "fmin z9.s, p7/M, z9.s, z1.s\n"
+ "fmin z10.s, p7/M, z10.s, z1.s\n"
+ "fmin z11.s, p7/M, z11.s, z1.s\n"
+ "fmin z12.s, p7/M, z12.s, z1.s\n"
+ "fmin z13.s, p7/M, z13.s, z1.s\n"
+ "fmin z20.s, p7/M, z20.s, z1.s\n"
+ "fmin z21.s, p7/M, z21.s, z1.s\n"
+ "fmin z22.s, p7/M, z22.s, z1.s\n"
+ "fmin z23.s, p7/M, z23.s, z1.s\n"
+ "fmin z24.s, p7/M, z24.s, z1.s\n"
+ "fmin z25.s, p7/M, z25.s, z1.s\n"
+ "fmax z4.s, p7/M, z4.s, z0.s\n"
+ "fmax z14.s, p7/M, z14.s, z0.s\n"
+ "fmax z15.s, p7/M, z15.s, z0.s\n"
+ "fmax z16.s, p7/M, z16.s, z0.s\n"
+ "fmax z17.s, p7/M, z17.s, z0.s\n"
+ "fmax z18.s, p7/M, z18.s, z0.s\n"
+ "fmax z8.s, p7/M, z8.s, z0.s\n"
+ "fmax z9.s, p7/M, z9.s, z0.s\n"
+ "fmax z10.s, p7/M, z10.s, z0.s\n"
+ "fmax z11.s, p7/M, z11.s, z0.s\n"
+ "fmax z12.s, p7/M, z12.s, z0.s\n"
+ "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "fmax z20.s, p7/M, z20.s, z0.s\n"
+ "fmax z21.s, p7/M, z21.s, z0.s\n"
+ "fmax z22.s, p7/M, z22.s, z0.s\n"
+ "fmax z23.s, p7/M, z23.s, z0.s\n"
+ "fmax z24.s, p7/M, z24.s, z0.s\n"
+ "fmax z25.s, p7/M, z25.s, z0.s\n"
+ "38:" // Height 3: No activation
+ "st1w { z4.s }, p6, [x26]\n"
+ "st1w { z14.s }, p5, [x26, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x26, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x26, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x26, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x26, #5, MUL VL]\n"
+ "addvl x26, x26, #6\n"
+ "st1w { z8.s }, p6, [x22]\n"
+ "st1w { z9.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x21]\n"
+ "st1w { z21.s }, p5, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x21, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x21, #5, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x28, ALL, MUL #6\n"
+ "cmp x28, XZR\n"
+ "bgt 28b\n"
+ "b 54f\n"
+ "40:" // Height 4
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x10\n"
+ "mov x9, %x[bias]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x27, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p6.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p5.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p4.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x28\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x28\n"
+ "cbz x9, 42f\n"
+ "ld1w { z8.s }, p7/Z, [x9]\n"
+ "ld1w { z9.s }, p7/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z10.s }, p7/Z, [x9, #2, MUL VL]\n"
+ "zip2 z14.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p7/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z12.s }, p7/Z, [x9, #4, MUL VL]\n"
+ "zip2 z15.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "ld1w { z13.s }, p7/Z, [x9, #5, MUL VL]\n"
+ "zip2 z16.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "addvl x9, x9, #6\n"
+ "zip2 z17.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "zip2 z18.d, z12.d, z12.d\n"
+ "zip1 z12.d, z12.d, z12.d\n"
+ "zip2 z19.d, z13.d, z13.d\n"
+ "zip1 z13.d, z13.d, z13.d\n"
+ "mov z20.d, z8.d\n"
+ "mov z26.d, z14.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z27.d, z15.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z28.d, z16.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z29.d, z17.d\n"
+ "mov z24.d, z12.d\n"
+ "mov z30.d, z18.d\n"
+ "mov z25.d, z13.d\n"
+ "mov z31.d, z19.d\n"
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z9.s }, p6/Z, [x26]\n"
+ "ld1w { z10.s }, p5/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z11.s }, p4/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x26, #3, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x26, #5, MUL VL]\n"
+ "ld1w { z14.s }, p6/Z, [x22]\n"
+ "ld1w { z15.s }, p5/Z, [x22, #1, MUL VL]\n"
+ "zip1 z8.d, z9.d, z14.d\n"
+ "zip2 z14.d, z9.d, z14.d\n"
+ "ld1w { z16.s }, p4/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x22, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z15.d\n"
+ "zip2 z15.d, z10.d, z15.d\n"
+ "ld1w { z18.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x22, #5, MUL VL]\n"
+ "zip1 z10.d, z11.d, z16.d\n"
+ "zip2 z16.d, z11.d, z16.d\n"
+ "ld1w { z21.s }, p6/Z, [x21]\n"
+ "ld1w { z22.s }, p5/Z, [x21, #1, MUL VL]\n"
+ "zip1 z11.d, z12.d, z17.d\n"
+ "zip2 z17.d, z12.d, z17.d\n"
+ "ld1w { z23.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x21, #3, MUL VL]\n"
+ "zip1 z12.d, z13.d, z18.d\n"
+ "zip2 z18.d, z13.d, z18.d\n"
+ "ld1w { z25.s }, p2/Z, [x21, #4, MUL VL]\n"
+ "ld1w { z4.s }, p1/Z, [x21, #5, MUL VL]\n"
+ "zip1 z13.d, z20.d, z19.d\n"
+ "zip2 z19.d, z20.d, z19.d\n"
+ "ld1w { z26.s }, p6/Z, [x20]\n"
+ "ld1w { z27.s }, p5/Z, [x20, #1, MUL VL]\n"
+ "zip1 z20.d, z21.d, z26.d\n"
+ "zip2 z26.d, z21.d, z26.d\n"
+ "ld1w { z28.s }, p4/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #3, MUL VL]\n"
+ "zip1 z21.d, z22.d, z27.d\n"
+ "zip2 z27.d, z22.d, z27.d\n"
+ "ld1w { z30.s }, p2/Z, [x20, #4, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #5, MUL VL]\n"
+ "zip1 z22.d, z23.d, z28.d\n"
+ "zip2 z28.d, z23.d, z28.d\n"
+ "zip1 z23.d, z24.d, z29.d\n"
+ "zip2 z29.d, z24.d, z29.d\n"
+ "zip1 z24.d, z25.d, z30.d\n"
+ "zip2 z30.d, z25.d, z30.d\n"
+ "zip1 z25.d, z4.d, z31.d\n"
+ "zip2 z31.d, z4.d, z31.d\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "44:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "45:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w24, [x19, x25, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x25, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "47:" // Height 4: input setup done
+ "cmp x24, #0x4\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x24\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ "ld1rqw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
+ ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z4.h }, p7/Z, [x27]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ "sub x24, x24, #0x4\n"
+ "trn1 z2.d, z2.d, z3.d\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n"
+ ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n"
+ "cmp x24, #0x4\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
+ ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ "addvl x27, x27, #-4\n"
+ ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
+ ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
+ ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x24\n"
+ "ld1rqw { z0.s }, p0/Z, [x23]\n"
+ "ld1rqw { z1.s }, p0/Z, [x22]\n"
+ ".inst 0x658abc00 // bfcvt z0.h, p7/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x21]\n"
+ "ld1rqw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x658abc21 // bfcvt z1.h, p7/M, z1.s\n"
+ ".inst 0x658abc42 // bfcvt z2.h, p7/M, z2.s\n"
+ ".inst 0x658abc63 // bfcvt z3.h, p7/M, z3.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z4.h }, p7/Z, [x27]\n"
+ "ld1h { z5.h }, p7/Z, [x27, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z7.h }, p7/Z, [x27, #3, MUL VL]\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ "trn1 z2.d, z2.d, z3.d\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e40e // bfmmla z14.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #4, MUL VL]\n"
+ ".inst 0x6465e45a // bfmmla z26.s, z2.h, z5.h\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #5, MUL VL]\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #6, MUL VL]\n"
+ ".inst 0x6467e45b // bfmmla z27.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #16\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
+ ".inst 0x6465e410 // bfmmla z16.s, z0.h, z5.h\n"
+ "ld1h { z4.h }, p7/Z, [x27, #-8, MUL VL]\n"
+ ".inst 0x6465e45c // bfmmla z28.s, z2.h, z5.h\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "ld1h { z5.h }, p7/Z, [x27, #-7, MUL VL]\n"
+ ".inst 0x6466e457 // bfmmla z23.s, z2.h, z6.h\n"
+ ".inst 0x6467e411 // bfmmla z17.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p7/Z, [x27, #-6, MUL VL]\n"
+ ".inst 0x6467e45d // bfmmla z29.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p7/Z, [x27, #-5, MUL VL]\n"
+ ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
+ "addvl x27, x27, #-4\n"
+ ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
+ ".inst 0x6465e412 // bfmmla z18.s, z0.h, z5.h\n"
+ ".inst 0x6465e45e // bfmmla z30.s, z2.h, z5.h\n"
+ ".inst 0x6466e40d // bfmmla z13.s, z0.h, z6.h\n"
+ ".inst 0x6466e459 // bfmmla z25.s, z2.h, z6.h\n"
+ ".inst 0x6467e413 // bfmmla z19.s, z0.h, z7.h\n"
+ ".inst 0x6467e45f // bfmmla z31.s, z2.h, z7.h\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 45b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x22, x26, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp1 z4.d, z8.d, z14.d\n"
+ "uzp2 z8.d, z8.d, z14.d\n"
+ "uzp1 z14.d, z9.d, z15.d\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z15.d\n"
+ "uzp1 z15.d, z10.d, z16.d\n"
+ "uzp2 z10.d, z10.d, z16.d\n"
+ "uzp1 z16.d, z11.d, z17.d\n"
+ "uzp2 z11.d, z11.d, z17.d\n"
+ "uzp1 z17.d, z12.d, z18.d\n"
+ "uzp2 z12.d, z12.d, z18.d\n"
+ "uzp1 z18.d, z13.d, z19.d\n"
+ "uzp2 z13.d, z13.d, z19.d\n"
+ "uzp1 z19.d, z20.d, z26.d\n"
+ "uzp2 z20.d, z20.d, z26.d\n"
+ "uzp1 z26.d, z21.d, z27.d\n"
+ "uzp2 z21.d, z21.d, z27.d\n"
+ "uzp1 z27.d, z22.d, z28.d\n"
+ "uzp2 z22.d, z22.d, z28.d\n"
+ "uzp1 z28.d, z23.d, z29.d\n"
+ "uzp2 z23.d, z23.d, z29.d\n"
+ "uzp1 z29.d, z24.d, z30.d\n"
+ "uzp2 z24.d, z24.d, z30.d\n"
+ "uzp1 z30.d, z25.d, z31.d\n"
+ "uzp2 z25.d, z25.d, z31.d\n"
+ "tbz %x[flags], #1, 51f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p7/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p7/Z, [x19]\n"
+ "fmin z4.s, p7/M, z4.s, z1.s\n"
+ "fmin z14.s, p7/M, z14.s, z1.s\n"
+ "fmin z15.s, p7/M, z15.s, z1.s\n"
+ "fmin z16.s, p7/M, z16.s, z1.s\n"
+ "fmin z17.s, p7/M, z17.s, z1.s\n"
+ "fmin z18.s, p7/M, z18.s, z1.s\n"
+ "fmin z8.s, p7/M, z8.s, z1.s\n"
+ "fmin z9.s, p7/M, z9.s, z1.s\n"
+ "fmin z10.s, p7/M, z10.s, z1.s\n"
+ "fmin z11.s, p7/M, z11.s, z1.s\n"
+ "fmin z12.s, p7/M, z12.s, z1.s\n"
+ "fmin z13.s, p7/M, z13.s, z1.s\n"
+ "fmin z19.s, p7/M, z19.s, z1.s\n"
+ "fmin z26.s, p7/M, z26.s, z1.s\n"
+ "fmin z27.s, p7/M, z27.s, z1.s\n"
+ "fmin z28.s, p7/M, z28.s, z1.s\n"
+ "fmin z29.s, p7/M, z29.s, z1.s\n"
+ "fmin z30.s, p7/M, z30.s, z1.s\n"
+ "fmin z20.s, p7/M, z20.s, z1.s\n"
+ "fmin z21.s, p7/M, z21.s, z1.s\n"
+ "fmin z22.s, p7/M, z22.s, z1.s\n"
+ "fmin z23.s, p7/M, z23.s, z1.s\n"
+ "fmin z24.s, p7/M, z24.s, z1.s\n"
+ "fmin z25.s, p7/M, z25.s, z1.s\n"
+ "fmax z4.s, p7/M, z4.s, z0.s\n"
+ "fmax z14.s, p7/M, z14.s, z0.s\n"
+ "fmax z15.s, p7/M, z15.s, z0.s\n"
+ "fmax z16.s, p7/M, z16.s, z0.s\n"
+ "fmax z17.s, p7/M, z17.s, z0.s\n"
+ "fmax z18.s, p7/M, z18.s, z0.s\n"
+ "fmax z8.s, p7/M, z8.s, z0.s\n"
+ "fmax z9.s, p7/M, z9.s, z0.s\n"
+ "fmax z10.s, p7/M, z10.s, z0.s\n"
+ "fmax z11.s, p7/M, z11.s, z0.s\n"
+ "fmax z12.s, p7/M, z12.s, z0.s\n"
+ "fmax z13.s, p7/M, z13.s, z0.s\n"
+ "fmax z19.s, p7/M, z19.s, z0.s\n"
+ "fmax z26.s, p7/M, z26.s, z0.s\n"
+ "fmax z27.s, p7/M, z27.s, z0.s\n"
+ "fmax z28.s, p7/M, z28.s, z0.s\n"
+ "fmax z29.s, p7/M, z29.s, z0.s\n"
+ "fmax z30.s, p7/M, z30.s, z0.s\n"
+ "fmax z20.s, p7/M, z20.s, z0.s\n"
+ "fmax z21.s, p7/M, z21.s, z0.s\n"
+ "fmax z22.s, p7/M, z22.s, z0.s\n"
+ "fmax z23.s, p7/M, z23.s, z0.s\n"
+ "fmax z24.s, p7/M, z24.s, z0.s\n"
+ "fmax z25.s, p7/M, z25.s, z0.s\n"
+ "51:" // Height 4: No activation
+ "st1w { z4.s }, p6, [x26]\n"
+ "st1w { z14.s }, p5, [x26, #1, MUL VL]\n"
+ "st1w { z15.s }, p4, [x26, #2, MUL VL]\n"
+ "st1w { z16.s }, p3, [x26, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x26, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x26, #5, MUL VL]\n"
+ "addvl x26, x26, #6\n"
+ "st1w { z8.s }, p6, [x22]\n"
+ "st1w { z9.s }, p5, [x22, #1, MUL VL]\n"
+ "st1w { z10.s }, p4, [x22, #2, MUL VL]\n"
+ "st1w { z11.s }, p3, [x22, #3, MUL VL]\n"
+ "st1w { z12.s }, p2, [x22, #4, MUL VL]\n"
+ "st1w { z13.s }, p1, [x22, #5, MUL VL]\n"
+ "st1w { z19.s }, p6, [x21]\n"
+ "st1w { z26.s }, p5, [x21, #1, MUL VL]\n"
+ "st1w { z27.s }, p4, [x21, #2, MUL VL]\n"
+ "st1w { z28.s }, p3, [x21, #3, MUL VL]\n"
+ "st1w { z29.s }, p2, [x21, #4, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #5, MUL VL]\n"
+ "st1w { z20.s }, p6, [x20]\n"
+ "st1w { z21.s }, p5, [x20, #1, MUL VL]\n"
+ "st1w { z22.s }, p4, [x20, #2, MUL VL]\n"
+ "st1w { z23.s }, p3, [x20, #3, MUL VL]\n"
+ "st1w { z24.s }, p2, [x20, #4, MUL VL]\n"
+ "st1w { z25.s }, p1, [x20, #5, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x28, ALL, MUL #6\n"
+ "cmp x28, XZR\n"
+ "bgt 41b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 54f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 53f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "53:" // Update direct input
+ "mov x19, #0x10\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "54:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
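
Note on the arithmetic in the kernel above: with fast_mode enabled, the FP32
operands are narrowed to BFloat16 in-register (BFCVT, round-to-nearest-even
under the default FPCR), packed two rows at a time (UZP1/TRN1) so each BFMMLA
consumes a 2x4 block, and the interleaved 2x2 FP32 accumulator tiles are
separated again with UZP1/UZP2 before writeback. A minimal scalar sketch of
one such step in plain C++ (to_bf16 and bfmmla_2x4x2 are illustrative names,
not library API; hardware accumulation order and NaN/Inf corner cases may
differ):

    #include <cstdint>
    #include <cstring>

    // Round an FP32 value to BFloat16 (round-to-nearest-even) and widen it
    // back, modelling the precision actually used for the multiplies.
    static float to_bf16(float x) {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        bits = (bits + (0x7fffu + ((bits >> 16) & 1u))) & 0xffff0000u;
        float r;
        std::memcpy(&r, &bits, sizeof(r));
        return r;
    }

    // One BFMMLA step: C (2x2, FP32) += A (2x4, BF16) * B (4x2, BF16).
    static void bfmmla_2x4x2(float C[2][2], const float A[2][4],
                             const float B[4][2]) {
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 2; ++j)
                for (int k = 0; k < 4; ++k)
                    C[i][j] += to_bf16(A[i][k]) * to_bf16(B[k][j]);
    }
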
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..d941ccc0e9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL.hpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_fp32bf16fp32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL
+{
+public:
+ typedef float lhs_operand_type;
+ typedef bfloat16 rhs_operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 14.06 };
+ case CPUModel::A510:
+ return { 5.31 };
+ case CPUModel::V1:
+ return { 26.64 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp32bf16fp32_mmla_6x4VL;
+ cls_sve_hybrid_fp32bf16fp32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
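
The "6x4VL" in the class name encodes the output tile: six rows by four SVE
vectors of FP32 columns, so out_width() scales with the runtime vector
length. A standalone illustration (the 256-bit width is an assumption for
the example, not a property of the library):

    #include <cstdio>

    int main() {
        const unsigned vector_bits = 256;              // assumed SVE width
        const unsigned f32_lanes   = vector_bits / 32; // get_vector_length<float>()
        const unsigned out_width   = f32_lanes * 4;    // "4VL": four vectors of output columns
        const unsigned out_height  = 6;                // fixed "6" rows per tile
        std::printf("output tile: %u x %u\n", out_height, out_width); // 6 x 32
        return 0;
    }
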
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..236eebad66
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32bf16fp32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,1793 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32bf16fp32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
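+
+ // The assembly below reads these fields through the offsetof(KernelArgs, ...)
+ // immediates bound to [args_ptr], so this layout must stay in sync with the
+ // operand list at the end of the function.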
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
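+ // Flag bits tested by the assembly below:
+ //   bit 0 (0x1): accumulate into the existing output
+ //   bit 1 (0x2): apply the min/max activation clamp
+ //   bit 2 (0x4): output is indirect
+ //   bit 3 (0x8): input is indirect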
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 66f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 53f\n"
+ "beq 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "cbz x11, 3f\n"
+ "ld1w { z8.s }, p5/Z, [x11]\n"
+ "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "addvl x11, x11, #4\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 5f\n"
+ "3:" // Height 1: no bias
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x26, #0x4\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "cmp x26, #0x4\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "add x25, x25, #0x10\n"
+ "addvl x9, x9, #8\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "addvl x9, x9, #8\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 6b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 12f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "12:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x28]\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "13:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "15:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "cbz x11, 16f\n"
+ "ld1w { z8.s }, p5/Z, [x11]\n"
+ "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "addvl x11, x11, #4\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "b 18f\n"
+ "16:" // Height 2: no bias
+ "tbz %x[flags], #0, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 18f\n"
+ "17:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "18:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 21f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "21:" // Height 2: input setup done
+ "cmp x26, #0x4\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "cmp x26, #0x4\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "addvl x9, x9, #8\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 19b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "tbz %x[flags], #1, 25f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z6.s, p5/M, z6.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z6.s, p5/M, z6.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "25:" // Height 2: No activation
+ "st1w { z6.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "26:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "28:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "cbz x11, 29f\n"
+ "ld1w { z8.s }, p5/Z, [x11]\n"
+ "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "addvl x11, x11, #4\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 31f\n"
+ "29:" // Height 3: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 31f\n"
+ "30:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 34f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "34:" // Height 3: input setup done
+ "cmp x26, #0x4\n"
+ "ble 36f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "cmp x26, #0x4\n"
+ ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ "bgt 35b\n"
+ "36:" // Height 3: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ "37:" // Height 3: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 32b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 38f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z6.s, p5/M, z6.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z6.s, p5/M, z6.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "38:" // Height 3: No activation
+ "st1w { z6.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x23]\n"
+ "st1w { z17.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x23, #3, MUL VL]\n"
+ "39:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "41:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "cbz x11, 42f\n"
+ "ld1w { z8.s }, p5/Z, [x11]\n"
+ "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "addvl x11, x11, #4\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "b 44f\n"
+ "42:" // Height 4: no bias
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 44f\n"
+ "43:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "44:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "45:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 47f\n"
+ "46:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "47:" // Height 4: input setup done
+ "cmp x26, #0x4\n"
+ "ble 49f\n"
+ "48:" // Height 4: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "sub x26, x26, #0x4\n"
+ "cmp x26, #0x4\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "add x25, x25, #0x10\n"
+ "trn1 z2.d, z2.d, z3.d\n"
+ ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ "bgt 48b\n"
+ "49:" // Height 4: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "trn1 z2.d, z2.d, z3.d\n"
+ ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ "50:" // Height 4: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 45b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "tbz %x[flags], #1, 51f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z6.s, p5/M, z6.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z6.s, p5/M, z6.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "51:" // Height 4: No activation
+ "st1w { z6.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x23]\n"
+ "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "52:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "54:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "cbz x11, 55f\n"
+ "ld1w { z8.s }, p5/Z, [x11]\n"
+ "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "addvl x11, x11, #4\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 57f\n"
+ "55:" // Height 5: no bias
+ "tbz %x[flags], #0, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z6.d, z31.d\n"
+ "zip2 z31.d, z6.d, z31.d\n"
+ "b 57f\n"
+ "56:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "57:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "58:" // Height 5: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 59f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 60f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 60f\n"
+ "59:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "60:" // Height 5: input setup done
+ "cmp x26, #0x4\n"
+ "ble 62f\n"
+ "61:" // Height 5: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x4\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ "cmp x26, #0x4\n"
+ "add x25, x25, #0x10\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ "trn1 z2.d, z2.d, z3.d\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "add x24, x24, #0x10\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
+ ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
+ ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
+ ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
+ ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n"
+ "bgt 61b\n"
+ "62:" // Height 5: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ "trn1 z2.d, z2.d, z3.d\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
+ ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
+ ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
+ ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
+ ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n"
+ "63:" // Height 5: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 58b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z6.s, p5/M, z6.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmin z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z1.s\n"
+ "fmin z26.s, p5/M, z26.s, z1.s\n"
+ "fmin z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z6.s, p5/M, z6.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z0.s\n"
+ "fmax z26.s, p5/M, z26.s, z0.s\n"
+ "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "64:" // Height 5: No activation
+ "st1w { z6.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x23]\n"
+ "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x21]\n"
+ "st1w { z25.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x21, #3, MUL VL]\n"
+ "65:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "mov x11, %x[bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "cbz x11, 68f\n"
+ "ld1w { z8.s }, p5/Z, [x11]\n"
+ "ld1w { z9.s }, p5/Z, [x11, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x11, #2, MUL VL]\n"
+ "zip2 z12.d, z8.d, z8.d\n"
+ "zip1 z8.d, z8.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x11, #3, MUL VL]\n"
+ "zip2 z13.d, z9.d, z9.d\n"
+ "zip1 z9.d, z9.d, z9.d\n"
+ "addvl x11, x11, #4\n"
+ "zip2 z14.d, z10.d, z10.d\n"
+ "zip1 z10.d, z10.d, z10.d\n"
+ "zip2 z15.d, z11.d, z11.d\n"
+ "zip1 z11.d, z11.d, z11.d\n"
+ "mov z16.d, z8.d\n"
+ "mov z20.d, z12.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z21.d, z13.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z22.d, z14.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z23.d, z15.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z28.d, z12.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z29.d, z13.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z30.d, z14.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z31.d, z15.d\n"
+ "b 70f\n"
+ "68:" // Height 6: no bias
+ "tbz %x[flags], #0, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x24]\n"
+ "ld1w { z13.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x24, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x23]\n"
+ "ld1w { z18.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x22]\n"
+ "ld1w { z21.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z22.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z25.s }, p4/Z, [x21]\n"
+ "ld1w { z26.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z27.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z6.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z28.s }, p4/Z, [x20]\n"
+ "ld1w { z29.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "ld1w { z30.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z6.d, z31.d\n"
+ "zip2 z31.d, z6.d, z31.d\n"
+ "b 70f\n"
+ "69:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "70:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "71:" // Height 6: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 72f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 73f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "73:" // Height 6: input setup done
+ "cmp x26, #0x4\n"
+ "ble 75f\n"
+ "74:" // Height 6: Multiply loop: Main loop head
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ "ld1rqw { z5.s }, p0/Z, [x20]\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "sub x26, x26, #0x4\n"
+ "cmp x26, #0x4\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z2.d, z2.d, z3.d\n"
+ "trn1 z4.d, z4.d, z5.d\n"
+ ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
+ ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
+ ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
+ ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
+ ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n"
+ "bgt 74b\n"
+ "75:" // Height 6: Multiply loop: Single iteration only
+ "whilelt p0.s, XZR, x26\n"
+ "ld1rqw { z0.s }, p0/Z, [x25]\n"
+ "ld1rqw { z1.s }, p0/Z, [x24]\n"
+ ".inst 0x658ab400 // bfcvt z0.h, p5/M, z0.s\n"
+ "ld1rqw { z2.s }, p0/Z, [x23]\n"
+ "ld1rqw { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x658ab421 // bfcvt z1.h, p5/M, z1.s\n"
+ ".inst 0x658ab442 // bfcvt z2.h, p5/M, z2.s\n"
+ "ld1rqw { z4.s }, p0/Z, [x21]\n"
+ "ld1rqw { z5.s }, p0/Z, [x20]\n"
+ ".inst 0x658ab463 // bfcvt z3.h, p5/M, z3.s\n"
+ ".inst 0x658ab484 // bfcvt z4.h, p5/M, z4.s\n"
+ ".inst 0x658ab4a5 // bfcvt z5.h, p5/M, z5.s\n"
+ "uzp1 z0.h, z0.h, z0.h\n"
+ "ld1h { z6.h }, p5/Z, [x9]\n"
+ "ld1h { z7.h }, p5/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z1.h, z1.h, z1.h\n"
+ "uzp1 z2.h, z2.h, z2.h\n"
+ "uzp1 z3.h, z3.h, z3.h\n"
+ "uzp1 z4.h, z4.h, z4.h\n"
+ "uzp1 z5.h, z5.h, z5.h\n"
+ "trn1 z0.d, z0.d, z1.d\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "trn1 z2.d, z2.d, z3.d\n"
+ "trn1 z4.d, z4.d, z5.d\n"
+ ".inst 0x6466e450 // bfmmla z16.s, z2.h, z6.h\n"
+ ".inst 0x6466e498 // bfmmla z24.s, z4.h, z6.h\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x6467e454 // bfmmla z20.s, z2.h, z7.h\n"
+ ".inst 0x6467e49c // bfmmla z28.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6466e451 // bfmmla z17.s, z2.h, z6.h\n"
+ ".inst 0x6466e499 // bfmmla z25.s, z4.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
+ ".inst 0x6467e49d // bfmmla z29.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6466e452 // bfmmla z18.s, z2.h, z6.h\n"
+ ".inst 0x6466e49a // bfmmla z26.s, z4.h, z6.h\n"
+ ".inst 0x6467e40e // bfmmla z14.s, z0.h, z7.h\n"
+ "ld1h { z6.h }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x6467e456 // bfmmla z22.s, z2.h, z7.h\n"
+ ".inst 0x6467e49e // bfmmla z30.s, z4.h, z7.h\n"
+ "ld1h { z7.h }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x6466e40b // bfmmla z11.s, z0.h, z6.h\n"
+ ".inst 0x6466e453 // bfmmla z19.s, z2.h, z6.h\n"
+ ".inst 0x6466e49b // bfmmla z27.s, z4.h, z6.h\n"
+ ".inst 0x6467e40f // bfmmla z15.s, z0.h, z7.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6467e49f // bfmmla z31.s, z4.h, z7.h\n"
+ "76:" // Height 6: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 71b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x24, x28, x19, LSL #2\n"
+ "add x23, x24, x19, LSL #2\n"
+ "uzp1 z6.d, z8.d, z12.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "tbz %x[flags], #1, 77f\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z6.s, p5/M, z6.s, z1.s\n"
+ "fmin z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z1.s\n"
+ "fmin z14.s, p5/M, z14.s, z1.s\n"
+ "fmin z8.s, p5/M, z8.s, z1.s\n"
+ "fmin z9.s, p5/M, z9.s, z1.s\n"
+ "fmin z10.s, p5/M, z10.s, z1.s\n"
+ "fmin z11.s, p5/M, z11.s, z1.s\n"
+ "fmin z15.s, p5/M, z15.s, z1.s\n"
+ "fmin z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z1.s\n"
+ "fmin z22.s, p5/M, z22.s, z1.s\n"
+ "fmin z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z1.s\n"
+ "fmin z18.s, p5/M, z18.s, z1.s\n"
+ "fmin z19.s, p5/M, z19.s, z1.s\n"
+ "fmin z23.s, p5/M, z23.s, z1.s\n"
+ "fmin z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z1.s\n"
+ "fmin z30.s, p5/M, z30.s, z1.s\n"
+ "fmin z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z1.s\n"
+ "fmin z26.s, p5/M, z26.s, z1.s\n"
+ "fmin z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z6.s, p5/M, z6.s, z0.s\n"
+ "fmax z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z0.s\n"
+ "fmax z14.s, p5/M, z14.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z0.s\n"
+ "fmax z9.s, p5/M, z9.s, z0.s\n"
+ "fmax z10.s, p5/M, z10.s, z0.s\n"
+ "fmax z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z0.s\n"
+ "fmax z22.s, p5/M, z22.s, z0.s\n"
+ "fmax z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z0.s\n"
+ "fmax z18.s, p5/M, z18.s, z0.s\n"
+ "fmax z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z0.s\n"
+ "fmax z30.s, p5/M, z30.s, z0.s\n"
+ "fmax z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z0.s\n"
+ "fmax z26.s, p5/M, z26.s, z0.s\n"
+ "fmax z27.s, p5/M, z27.s, z0.s\n"
+ "77:" // Height 6: No activation
+ "st1w { z6.s }, p4, [x28]\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x24]\n"
+ "st1w { z9.s }, p3, [x24, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x24, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z15.s }, p4, [x23]\n"
+ "st1w { z20.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z21.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z23.s }, p4, [x21]\n"
+ "st1w { z28.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z29.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "78:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 67b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 80f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "79:" // Update direct input
+ "mov x19, #0x18\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "80:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
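
The kernel that ends here is one of the FP32 hybrid kernels that compute internally in BF16: each row of A is narrowed with bfcvt, compacted with uzp1, and paired with its neighbour via trn1 so that every 128-bit segment holds a 2x4 BF16 tile, which bfmmla multiplies against a 2x4 tile of B while accumulating in FP32; the uzp1/uzp2 runs after the string loop split the 2x2 output tiles back into per-row results. As a scalar model of what one 128-bit bfmmla segment computes (a sketch of the instruction's documented behaviour, not code from this patch; BF16 is modelled as the top half of an FP32):

    #include <cstdint>
    #include <cstring>

    static float bf16_to_f32(uint16_t h) {
        uint32_t w = uint32_t(h) << 16; // BF16 is the high half of an FP32
        float f;
        std::memcpy(&f, &w, sizeof(f));
        return f;
    }

    // c: row-major 2x2 FP32 tile; a, b: row-major 2x4 BF16 tiles.
    // Per 128-bit segment, bfmmla computes c += a * transpose(b).
    static void bfmmla_segment(float c[4], const uint16_t a[8], const uint16_t b[8]) {
        for (int i = 0; i < 2; ++i)
            for (int j = 0; j < 2; ++j)
                for (int k = 0; k < 4; ++k)
                    c[i * 2 + j] += bf16_to_f32(a[i * 4 + k]) * bf16_to_f32(b[j * 4 + k]);
    }
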
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index c278b3fc6b..c8a7d66f28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -22,9 +22,10 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_s8qa_dot_4x4VL( ARGLIST );
class cls_sve_hybrid_s8qa_dot_4x4VL
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,22 @@ public:
return false;
}
- StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 29.89 };
+ case CPUModel::A510:
+ return { 17.12 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8qa_dot_4x4VL;
@@ -80,4 +97,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
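
The header hunks above split the single operand_type into lhs_operand_type and rhs_operand_type (both int8_t for this kernel) and attach per-CPU-model performance figures via get_performance_parameters. The split presumably lets kernel classes whose two operands differ in element type be described uniformly; a minimal sketch of code that is generic over the two typedefs (illustrative only, assuming the kernel header above is included):

    #include <type_traits>

    // True when a kernel class declares different LHS and RHS element types.
    template <typename KernelCls>
    constexpr bool mixed_operands() {
        return !std::is_same<typename KernelCls::lhs_operand_type,
                             typename KernelCls::rhs_operand_type>::value;
    }
    // mixed_operands<arm_gemm::cls_sve_hybrid_s8qa_dot_4x4VL>() is false: both int8_t.
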
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
index 8a7465ba6b..3031f5abf5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -158,7 +158,6 @@ void sve_hybrid_s8qa_dot_4x4VL (
"tbnz %x[flags], #31, 8f\n"
"sdot z11.s, z0.b, z15.b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
"sub x24, x24, #0x10\n"
"cmp x24, #0x10\n"
"bgt 7b\n"
@@ -170,7 +169,6 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x23]\n"
"sdot z16.s, z4.b, z0.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
"sdot z17.s, z5.b, z0.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"addvl x28, x28, #4\n"
@@ -212,9 +210,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"tbnz %x[flags], #31, 11f\n"
"sdot z11.s, z0.b, z15.b\n"
"11:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
"cmp x25, x19\n"
"bne 4b\n"
"tbnz %x[flags], #31, 12f\n"
@@ -251,16 +248,16 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
"tbz %x[flags], #5, 13f\n"
"and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
"and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"and z7.d, z19.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z16.s, z16.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z17.s, z17.s, z5.s\n"
"sqadd z18.s, z18.s, z6.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
"13:" // Height 1: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
@@ -396,9 +393,7 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"22:" // Height 2: Multiply loop: unique 3: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
"sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"cmp x24, #0x10\n"
"bgt 21b\n"
"23:" // Height 2: Multiply loop: Single iteration only
@@ -409,12 +404,10 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x23]\n"
"sdot z16.s, z4.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
"sdot z17.s, z5.b, z0.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
"sdot z21.s, z5.b, z1.b[0]\n"
"sdot z18.s, z6.b, z0.b[0]\n"
@@ -470,10 +463,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z11.s, z0.b, z15.b\n"
"sdot z12.s, z1.b, z15.b\n"
"25:" // Height 2: Multiply loop: unique 4: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
"cmp x25, x19\n"
"bne 18b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -527,27 +518,27 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
"tbz %x[flags], #5, 27f\n"
"and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
"and z6.d, z18.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
"and z7.d, z19.d, z0.d\n"
"and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"and z9.d, z21.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
"asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
"asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
"sqadd z20.s, z20.s, z8.s\n"
"sqadd z21.s, z21.s, z9.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"sqadd z22.s, z22.s, z10.s\n"
"sqadd z23.s, z23.s, z4.s\n"
"27:" // Height 2: no shift correction
@@ -731,11 +722,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
"36:" // Height 3: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
"sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"cmp x24, #0x10\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"bgt 35b\n"
"37:" // Height 3: Multiply loop: Single iteration only
"ld1b { z4.b }, p2/Z, [x28]\n"
@@ -745,16 +733,13 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x23]\n"
"sdot z16.s, z4.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
"sdot z17.s, z5.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"addvl x28, x28, #4\n"
+ "sdot z24.s, z4.b, z2.b[0]\n"
"sdot z21.s, z5.b, z1.b[0]\n"
"sdot z25.s, z5.b, z2.b[0]\n"
"sdot z18.s, z6.b, z0.b[0]\n"
@@ -825,11 +810,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z12.s, z1.b, z15.b\n"
"sdot z13.s, z2.b, z15.b\n"
"39:" // Height 3: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
"cmp x25, x19\n"
"bne 32b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -899,39 +881,39 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
"tbz %x[flags], #5, 41f\n"
"and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
"and z6.d, z18.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
"and z7.d, z19.d, z0.d\n"
"and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"and z9.d, z21.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
"asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
"asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
"sqadd z20.s, z20.s, z8.s\n"
"sqadd z21.s, z21.s, z9.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"sqadd z22.s, z22.s, z10.s\n"
"sqadd z23.s, z23.s, z4.s\n"
- "and z6.d, z25.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z24.s, z24.s, z5.s\n"
+ "and z6.d, z25.d, z0.d\n"
"and z7.d, z26.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"and z8.d, z27.d, z0.d\n"
- "sqadd z25.s, z25.s, z6.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z8.s, z8.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z6.s\n"
"sqadd z26.s, z26.s, z7.s\n"
"sqadd z27.s, z27.s, z8.s\n"
"41:" // Height 3: no shift correction
@@ -1165,12 +1147,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z13.s, z2.b, z15.b\n"
"sdot z14.s, z3.b, z15.b\n"
"50:" // Height 4: Multiply loop: unique 7: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
"sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"cmp x24, #0x10\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"bgt 49b\n"
"51:" // Height 4: Multiply loop: Single iteration only
"ld1b { z4.b }, p2/Z, [x28]\n"
@@ -1180,19 +1158,15 @@ void sve_hybrid_s8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x23]\n"
"sdot z16.s, z4.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
"sdot z17.s, z5.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "sdot z24.s, z4.b, z2.b[0]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
"sdot z21.s, z5.b, z1.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"addvl x28, x28, #4\n"
+ "sdot z24.s, z4.b, z2.b[0]\n"
"sdot z28.s, z4.b, z3.b[0]\n"
"sdot z25.s, z5.b, z2.b[0]\n"
"sdot z29.s, z5.b, z3.b[0]\n"
@@ -1279,12 +1253,8 @@ void sve_hybrid_s8qa_dot_4x4VL (
"sdot z13.s, z2.b, z15.b\n"
"sdot z14.s, z3.b, z15.b\n"
"53:" // Height 4: Multiply loop: unique 8: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
"cmp x25, x19\n"
"bne 46b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1370,52 +1340,52 @@ void sve_hybrid_s8qa_dot_4x4VL (
".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
"tbz %x[flags], #5, 55f\n"
"and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
"and z6.d, z18.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
"and z7.d, z19.d, z0.d\n"
"and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"and z9.d, z21.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
"asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
"asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
"sqadd z20.s, z20.s, z8.s\n"
"sqadd z21.s, z21.s, z9.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"sqadd z22.s, z22.s, z10.s\n"
"sqadd z23.s, z23.s, z4.s\n"
- "and z6.d, z25.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z24.s, z24.s, z5.s\n"
+ "and z6.d, z25.d, z0.d\n"
"and z7.d, z26.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"and z8.d, z27.d, z0.d\n"
- "and z9.d, z28.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z8.s, z8.s, #0x1f\n"
"sqadd z25.s, z25.s, z6.s\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "and z9.d, z28.d, z0.d\n"
"and z10.d, z29.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
"and z4.d, z30.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
"asr z10.s, z10.s, #0x1f\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "and z5.d, z31.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
- "sqadd z27.s, z27.s, z8.s\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z28.s, z28.s, z9.s\n"
"sqadd z29.s, z29.s, z10.s\n"
"sqadd z30.s, z30.s, z4.s\n"
+ "and z5.d, z31.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
"sqadd z31.s, z31.s, z5.s\n"
"55:" // Height 4: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
@@ -1529,4 +1499,4 @@ void sve_hybrid_s8qa_dot_4x4VL (
}
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // __ARM_FEATURE_SVE
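
The hunks in this file all sit in the same requantization tail: sqrdmulh by the per-layer multiplier, the optional shift correction (the and / asr #0x1f / sqadd groups, gated by flag bit 5), a rounding shift (srshl), the c_offset add, and a clamp before narrowing. The patch drops the prfm prefetches and regroups instructions for scheduling; the requantization arithmetic itself is unchanged. A scalar model of one lane, as a sketch (the nudge condition mirrors the and/asr/sqadd pattern, with the right shift stored negated as it is in z0 here; treat the interpretation as an assumption):

    #include <algorithm>
    #include <cstdint>

    static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift, // shift <= 0
                                  int32_t c_offset, int32_t minval, int32_t maxval)
    {
        // sqrdmulh: saturating rounding "doubling multiply high".
        int64_t v64 = (((int64_t)acc * (int64_t)mul) + (1LL << 30)) >> 31;
        int32_t v = (int32_t)std::min<int64_t>(std::max<int64_t>(v64, INT32_MIN), INT32_MAX);

        // Shift correction: masking with the (negative) shift value keeps the
        // sign bit, so this nudges negative values down by one before the
        // rounding shift, making ties round away from zero. No-op if shift == 0.
        if ((v & shift) < 0 && v > INT32_MIN) v -= 1; // sqadd of the asr #0x1f result

        // srshl with a negative shift amount: rounding arithmetic shift right.
        int s = -shift;
        if (s > 0) v = (int32_t)(((int64_t)v + (1LL << (s - 1))) >> s);

        v += c_offset;                             // add the c_offset splat
        v = std::min(std::max(v, minval), maxval); // smin / smax clamp
        return (int8_t)v;                          // uzp1 narrowing to bytes
    }
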
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
new file mode 100644
index 0000000000..9681505e8c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_s8qa_mmla_4x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8qa_mmla_4x4VL
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 8, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 47.37 };
+ case CPUModel::A510:
+ return { 20.88 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8qa_mmla_4x4VL;
+ cls_sve_hybrid_s8qa_mmla_4x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
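
Relative to the dot-product variant, this new MMLA kernel advertises roughly 1.6x the per-cycle figure on the default model (47.37 vs 29.89) but only about 1.2x on the A510 (20.88 vs 17.12), the kind of per-CPU gap a kernel-selection heuristic can weigh. A toy chooser over just these two candidates (entirely hypothetical structure; the real selection machinery lives in arm_gemm's kernel lists):

    #include <cstdio>

    // Hypothetical: pick between the two s8qa kernels by advertised throughput.
    static const char *pick_s8qa_kernel(bool is_a510) {
        const double dot  = is_a510 ? 17.12 : 29.89; // sve_hybrid_s8qa_dot_4x4VL
        const double mmla = is_a510 ? 20.88 : 47.37; // sve_hybrid_s8qa_mmla_4x4VL
        return (mmla > dot) ? "sve_hybrid_s8qa_mmla_4x4VL"
                            : "sve_hybrid_s8qa_dot_4x4VL";
    }

    int main() {
        std::printf("A510:    %s\n", pick_s8qa_kernel(true));
        std::printf("default: %s\n", pick_s8qa_kernel(false));
        return 0;
    }
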
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
new file mode 100644
index 0000000000..04f80982e8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_mmla_4x4VL/generic.cpp
@@ -0,0 +1,1418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qa_mmla_4x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
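+ // Flag bits consumed by the assembly below (as this file uses them):
+ //   bit 2  - indirect output; bit 3 - indirect input (tested ahead of the
+ //            "setup direct input" branches);
+ //   bit 5  - set above when c_offset > minval; enables the shift-correction
+ //            path before the rounding shift;
+ //   bit 31 - set once the row sums in z11 have been reduced, so later
+ //            column blocks skip recomputing them.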
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "mov z11.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov z15.b, #0x1\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x9\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x24, #0x10\n"
+ "ble 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
+ ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "subs x24, x24, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
+ ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 4b\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z16.d\n"
+ "tbnz %x[flags], #31, 12f\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1rw { z1.s }, p2/Z, [x22]\n"
+ "mov z11.s, z11.s[0]\n"
+ "neg z1.s, p2/M, z1.s\n"
+ "mul z11.s, p2/M, z11.s, z1.s\n"
+ "12:" // Height 1: skip row sum fixup
+ "add z23.s, z23.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "add z23.s, z23.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
+ "add z17.s, z17.s, z1.s\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "tbz %x[flags], #5, 13f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "and z7.d, z19.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "13:" // Height 1: no shift correction
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x22, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x22]\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z23.h, z23.h, z17.h\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z23.b, z23.b, z17.b\n"
+ "st1b { z23.b }, p1, [x26]\n"
+ "addvl x26, x26, #1\n"
+ "14:" // Height 1: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 2b\n"
+ "b 58f\n"
+ "15:" // Height 2
+ "mov z11.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z15.b, #0x1\n"
+ "mov x26, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x9\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "20:" // Height 2: input setup done
+ "cmp x24, #0x10\n"
+ "ble 23f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
+ ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ "tbnz %x[flags], #31, 22f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "22:" // Height 2: Multiply loop: unique 3: skip row sum
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 21b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
+ ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ "ble 24f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 25f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "25:" // Height 2: Multiply loop: unique 4: skip row sum
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 18b\n"
+ "uzp1 z7.d, z16.d, z20.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x21, x26, x19\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z7.d\n"
+ "tbnz %x[flags], #31, 26f\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x22]\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "neg z2.s, p2/M, z2.s\n"
+ "mul z11.s, p2/M, z11.s, z2.s\n"
+ "mul z12.s, p2/M, z12.s, z2.s\n"
+ "26:" // Height 2: skip row sum fixup
+ "add z23.s, z23.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "add z16.s, z16.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "tbz %x[flags], #5, 27f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "and z7.d, z22.d, z0.d\n"
+ "and z8.d, z16.d, z0.d\n"
+ "and z9.d, z17.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "sqadd z16.s, z16.s, z8.s\n"
+ "sqadd z17.s, z17.s, z9.s\n"
+ "and z10.d, z18.d, z0.d\n"
+ "and z4.d, z19.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z10.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "27:" // Height 2: no shift correction
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add x22, %x[qp], %[minval]\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x22]\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z23.h, z23.h, z20.h\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
+ "st1b { z23.b }, p1, [x26]\n"
+ "add z17.s, z17.s, z4.s\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "28:" // Height 2: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 16b\n"
+ "b 58f\n"
+ "29:" // Height 3
+ "mov z11.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "mov x26, %x[output_ptr]\n"
+ "mov z15.b, #0x1\n"
+ "30:" // Height 3: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x9\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 34f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "34:" // Height 3: input setup done
+ "cmp x24, #0x10\n"
+ "ble 37f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+ ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
+ ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
+ ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n"
+ ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n"
+ ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
+ ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n"
+ ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
+ ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n"
+ ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
+ ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
+ ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n"
+ "tbnz %x[flags], #31, 36f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "sdot z13.s, z3.b, z15.b\n"
+ "36:" // Height 3: Multiply loop: unique 5: skip row sum
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 35b\n"
+ "37:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x24, x24, #0x8\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n"
+ ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+ ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n"
+ ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
+ ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
+ ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
+ ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
+ "ble 38f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
+ ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n"
+ ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
+ ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n"
+ ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
+ ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
+ ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n"
+ "38:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 39f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "sdot z13.s, z3.b, z15.b\n"
+ "39:" // Height 3: Multiply loop: unique 6: skip row sum
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 32b\n"
+ "uzp1 z7.d, z16.d, z20.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x21, x26, x19\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "add x20, x21, x19\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z7.d\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1rw { z3.s }, p2/Z, [x22]\n"
+ ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "neg z3.s, p2/M, z3.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z3.s\n"
+ "mul z12.s, p2/M, z12.s, z3.s\n"
+ "mul z13.s, p2/M, z13.s, z3.s\n"
+ "40:" // Height 3: skip row sum fixup
+ "add z31.s, z31.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "add z16.s, z16.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "tbz %x[flags], #5, 41f\n"
+ "and z4.d, z31.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z4.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "and z7.d, z22.d, z0.d\n"
+ "and z8.d, z16.d, z0.d\n"
+ "and z9.d, z17.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "sqadd z16.s, z16.s, z8.s\n"
+ "sqadd z17.s, z17.s, z9.s\n"
+ "and z10.d, z18.d, z0.d\n"
+ "and z4.d, z19.d, z0.d\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z10.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "and z7.d, z26.d, z0.d\n"
+ "and z8.d, z27.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "41:" // Height 3: no shift correction
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add x22, %x[qp], %[minval]\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x22]\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
+ "st1b { z31.b }, p1, [x26]\n"
+ "add z17.s, z17.s, z4.s\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x20]\n"
+ "42:" // Height 3: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 30b\n"
+ "b 58f\n"
+ "43:" // Height 4
+ "mov z11.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "mov z14.s, #0x0\n"
+ "mov x19, #0x4\n"
+ "mov z15.b, #0x1\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x9\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "45:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "46:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 47f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x25, 48f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "48:" // Height 4: input setup done
+ "cmp x24, #0x10\n"
+ "ble 51f\n"
+ "49:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x20]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
+ ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
+ ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n"
+ ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n"
+ ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
+ ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n"
+ ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
+ ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n"
+ ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
+ ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
+ ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n"
+ "tbnz %x[flags], #31, 50f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "sdot z13.s, z3.b, z15.b\n"
+ "50:" // Height 4: Multiply loop: unique 7: skip row sum
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 49b\n"
+ "51:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x20]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45059810 // smmla z16.s, z0.b, z5.b\n"
+ ".inst 0x45069814 // smmla z20.s, z0.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45079811 // smmla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4506985c // smmla z28.s, z2.b, z6.b\n"
+ ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ ".inst 0x45089815 // smmla z21.s, z0.b, z8.b\n"
+ ".inst 0x4508985d // smmla z29.s, z2.b, z8.b\n"
+ ".inst 0x45099812 // smmla z18.s, z0.b, z9.b\n"
+ ".inst 0x4509985a // smmla z26.s, z2.b, z9.b\n"
+ ".inst 0x450a9816 // smmla z22.s, z0.b, z10.b\n"
+ ".inst 0x450a985e // smmla z30.s, z2.b, z10.b\n"
+ ".inst 0x45049813 // smmla z19.s, z0.b, z4.b\n"
+ ".inst 0x4504985b // smmla z27.s, z2.b, z4.b\n"
+ ".inst 0x45059817 // smmla z23.s, z0.b, z5.b\n"
+ ".inst 0x4505985f // smmla z31.s, z2.b, z5.b\n"
+ "ble 52f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45069878 // smmla z24.s, z3.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079834 // smmla z20.s, z1.b, z7.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4507987c // smmla z28.s, z3.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45089831 // smmla z17.s, z1.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45089879 // smmla z25.s, z3.b, z8.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45099835 // smmla z21.s, z1.b, z9.b\n"
+ ".inst 0x4509987d // smmla z29.s, z3.b, z9.b\n"
+ ".inst 0x450a9832 // smmla z18.s, z1.b, z10.b\n"
+ ".inst 0x450a987a // smmla z26.s, z3.b, z10.b\n"
+ ".inst 0x45049836 // smmla z22.s, z1.b, z4.b\n"
+ ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
+ ".inst 0x45069837 // smmla z23.s, z1.b, z6.b\n"
+ ".inst 0x4506987f // smmla z31.s, z3.b, z6.b\n"
+ "52:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 53f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z11.s, z1.b, z15.b\n"
+ "sdot z13.s, z3.b, z15.b\n"
+ "53:" // Height 4: Multiply loop: unique 8: skip row sum
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 46b\n"
+ "uzp1 z7.d, z16.d, z20.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x21, x26, x19\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "add x20, x21, x19\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "add x19, x20, x19\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z7.d\n"
+ "tbnz %x[flags], #31, 54f\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "neg z4.s, p2/M, z4.s\n"
+ "mov z14.s, z13.s[3]\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z4.s\n"
+ "mul z12.s, p2/M, z12.s, z4.s\n"
+ "mul z13.s, p2/M, z13.s, z4.s\n"
+ "mul z14.s, p2/M, z14.s, z4.s\n"
+ "54:" // Height 4: skip row sum fixup
+ "add z31.s, z31.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "add z16.s, z16.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "add z23.s, z23.s, z13.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z28.s, z28.s, z1.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
+ ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "tbz %x[flags], #5, 55f\n"
+ "and z4.d, z31.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z4.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "and z7.d, z22.d, z0.d\n"
+ "and z8.d, z16.d, z0.d\n"
+ "and z9.d, z17.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "sqadd z16.s, z16.s, z8.s\n"
+ "sqadd z17.s, z17.s, z9.s\n"
+ "and z10.d, z18.d, z0.d\n"
+ "and z4.d, z19.d, z0.d\n"
+ "and z5.d, z23.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z10.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "sqadd z23.s, z23.s, z5.s\n"
+ "and z6.d, z28.d, z0.d\n"
+ "and z7.d, z29.d, z0.d\n"
+ "and z8.d, z30.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z6.s\n"
+ "sqadd z29.s, z29.s, z7.s\n"
+ "sqadd z30.s, z30.s, z8.s\n"
+ "and z9.d, z24.d, z0.d\n"
+ "and z10.d, z25.d, z0.d\n"
+ "and z4.d, z26.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z9.s\n"
+ "sqadd z25.s, z25.s, z10.s\n"
+ "sqadd z26.s, z26.s, z4.s\n"
+ "and z5.d, z27.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z5.s\n"
+ "55:" // Height 4: no shift correction
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add x22, %x[qp], %[minval]\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x22]\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
+ "st1b { z31.b }, p1, [x26]\n"
+ "add z17.s, z17.s, z4.s\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "add z29.s, z29.s, z4.s\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "uzp1 z28.h, z29.h, z30.h\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "add z26.s, z26.s, z4.s\n"
+ "uzp1 z23.b, z23.b, z28.b\n"
+ "st1b { z23.b }, p1, [x20]\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x19]\n"
+ "56:" // Height 4: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 44b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 58f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 57f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "57:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "58:" // Exit
+
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
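
Note on the kernel above: the `.inst 0x4505...`-style words are SVE SMMLA (signed 8-bit integer matrix multiply-accumulate) instructions, emitted as raw encodings so the file still assembles on toolchains without I8MM support. Per 128-bit segment, SMMLA treats each source vector as two rows of eight int8 values and accumulates a 2x2 int32 tile — which is why the kernel pairs two LHS rows per vector with trn1/trn2, de-interleaves the finished tiles with uzp1/uzp2, and (when flag bit 31 is set) uses `sdot` against the all-ones z15 to accumulate the per-row byte sums consumed by the b_offset requantization fixup; the epilogue then requantizes with sqrdmulh/srshl plus the and/asr/sqadd sequence that corrects rounding for negative values. A minimal scalar sketch of one SMMLA segment, with illustrative names only (this models the instruction's semantics, not the library's code):

```cpp
#include <cstdint>

// Scalar model of one 128-bit SMMLA segment: acc is a 2x2 int32 tile,
// a and b each hold two rows of eight int8 values. The instruction
// computes acc[i][j] += dot(a[i], b[j]) for i, j in {0, 1}.
static void smmla_segment(int32_t acc[2][2],
                          const int8_t a[2][8],
                          const int8_t b[2][8])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            int32_t sum = 0;
            for (int k = 0; k < 8; k++) {
                sum += int32_t(a[i][k]) * int32_t(b[j][k]);
            }
            acc[i][j] += sum; // accumulate into the 2x2 tile
        }
    }
}
```

A full-width SVE register holds one such segment per 128 bits of vector length, so each SMMLA advances several 2x2 tiles in parallel.
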
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
index 57056b4c2a..dad04c81e8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -22,9 +22,10 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_s8qs_dot_6x4VL( ARGLIST );
class cls_sve_hybrid_s8qs_dot_6x4VL
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,22 @@ public:
return false;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 30.13 };
+ case CPUModel::A510:
+ return { 19.77 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8qs_dot_6x4VL;
@@ -80,4 +97,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
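
The `get_performance_parameters` hook added above lets each kernel report an estimated throughput figure per CPU model (30.13 by default, 19.77 on A510 here), which a dispatcher can compare across candidate kernels when choosing an implementation. A minimal sketch of such a comparison follows; it assumes `PerformanceParameters` carries a single MACs-per-cycle estimate, and the field name below is an assumption for illustration, not the library's actual definition:

```cpp
// Illustrative only: compare two kernel variants by modelled throughput.
struct PerformanceParameters {
    float macs_per_cycle; // assumed field; matches the single-value init above
};

// Return true if the first candidate's model predicts at least the
// throughput of the second, so a selector would prefer it.
static bool prefer_first(const PerformanceParameters &a,
                         const PerformanceParameters &b)
{
    return a.macs_per_cycle >= b.macs_per_cycle;
}
```
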
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
index 0328c107e2..6b08d2834b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -135,13 +135,12 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "cmp x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
@@ -176,7 +175,6 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"addvl x28, x28, #4\n"
@@ -215,9 +213,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"9:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 4b\n"
"ld1w { z0.s }, p2/Z, [x11]\n"
@@ -259,16 +256,16 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
"tbz %x[flags], #5, 12f\n"
"and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
"and z6.d, z10.d, z2.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"and z7.d, z11.d, z3.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z9.s, z9.s, z5.s\n"
"sqadd z10.s, z10.s, z6.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
"sqadd z11.s, z11.s, z7.s\n"
"12:" // Height 1: no shift correction
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
@@ -351,16 +348,14 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #4, MUL VL]\n"
@@ -411,9 +406,7 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
"sdot z13.s, z7.b, z1.b[0]\n"
@@ -468,10 +461,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
"22:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 17b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -523,27 +514,27 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
"tbz %x[flags], #5, 25f\n"
"and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
"and z6.d, z10.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z10.s, z10.s, z6.s\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z12.d, z0.d\n"
"and z5.d, z13.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
"sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z12.s, z12.s, z4.s\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z6.d, z14.d, z2.d\n"
"and z7.d, z15.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z13.s, z13.s, z5.s\n"
"sqadd z14.s, z14.s, z6.s\n"
"sqadd z15.s, z15.s, z7.s\n"
"25:" // Height 2: no shift correction
@@ -654,21 +645,18 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
"add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -733,12 +721,9 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"sdot z16.s, z6.b, z2.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
@@ -808,11 +793,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
"35:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 30b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -873,37 +855,37 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
"tbz %x[flags], #5, 38f\n"
"and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
"and z6.d, z10.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z10.s, z10.s, z6.s\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z12.d, z0.d\n"
"and z5.d, z13.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
"sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z13.s, z13.s, z5.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "and z7.d, z15.d, z3.d\n"
"and z4.d, z16.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z14.s, z14.s, z6.s\n"
- "and z5.d, z17.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z15.s, z15.s, z7.s\n"
- "and z6.d, z18.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z16.s, z16.s, z4.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "and z6.d, z18.d, z2.d\n"
"and z7.d, z19.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z7.s, z7.s, #0x1f\n"
"sqadd z17.s, z17.s, z5.s\n"
"sqadd z18.s, z18.s, z6.s\n"
@@ -1043,26 +1025,22 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -1141,19 +1119,15 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "add x22, x22, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"addvl x28, x28, #4\n"
@@ -1234,12 +1208,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
"48:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 43b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1309,52 +1279,52 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
"tbz %x[flags], #5, 51f\n"
"and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
"and z6.d, z10.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z10.s, z10.s, z6.s\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z12.d, z0.d\n"
"and z5.d, z13.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
"sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z13.s, z13.s, z5.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "and z7.d, z15.d, z3.d\n"
"and z4.d, z16.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z14.s, z14.s, z6.s\n"
- "and z5.d, z17.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z15.s, z15.s, z7.s\n"
- "and z6.d, z18.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z16.s, z16.s, z4.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "and z6.d, z18.d, z2.d\n"
"and z7.d, z19.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z7.s, z7.s, #0x1f\n"
"sqadd z17.s, z17.s, z5.s\n"
- "and z4.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z18.s, z18.s, z6.s\n"
- "and z5.d, z21.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "and z5.d, z21.d, z1.d\n"
"and z6.d, z22.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z20.s, z20.s, z4.s\n"
- "and z7.d, z23.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z21.s, z21.s, z5.s\n"
"sqadd z22.s, z22.s, z6.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
"sqadd z23.s, z23.s, z7.s\n"
"51:" // Height 4: no shift correction
".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
@@ -1518,32 +1488,27 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"add x21, x21, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
"sdot z24.s, z6.b, z4.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z22.s, z6.b, z3.b[0]\n"
@@ -1635,22 +1600,17 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "add x21, x21, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"sdot z24.s, z6.b, z4.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
@@ -1746,13 +1706,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
"61:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 56b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1831,63 +1786,63 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
"tbz %x[flags], #5, 64f\n"
"and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
"and z6.d, z10.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z10.s, z10.s, z6.s\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z12.d, z0.d\n"
"and z5.d, z13.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
"sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z13.s, z13.s, z5.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "and z7.d, z15.d, z3.d\n"
"and z4.d, z16.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z14.s, z14.s, z6.s\n"
- "and z5.d, z17.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z15.s, z15.s, z7.s\n"
- "and z6.d, z18.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z16.s, z16.s, z4.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "and z6.d, z18.d, z2.d\n"
"and z7.d, z19.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z7.s, z7.s, #0x1f\n"
"sqadd z17.s, z17.s, z5.s\n"
- "and z4.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z18.s, z18.s, z6.s\n"
- "and z5.d, z21.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "and z5.d, z21.d, z1.d\n"
"and z6.d, z22.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z20.s, z20.s, z4.s\n"
- "and z7.d, z23.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z21.s, z21.s, z5.s\n"
- "and z4.d, z24.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z22.s, z22.s, z6.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "and z4.d, z24.d, z0.d\n"
"and z5.d, z25.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
"sqadd z23.s, z23.s, z7.s\n"
- "and z6.d, z26.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z24.s, z24.s, z4.s\n"
+ "sqadd z25.s, z25.s, z5.s\n"
+ "and z6.d, z26.d, z2.d\n"
"and z7.d, z27.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z25.s, z25.s, z5.s\n"
"sqadd z26.s, z26.s, z6.s\n"
"sqadd z27.s, z27.s, z7.s\n"
"64:" // Height 5: no shift correction
@@ -2082,37 +2037,31 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"add x20, x20, #0x10\n"
"sdot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
"sdot z28.s, z6.b, z5.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"sdot z29.s, z7.b, z5.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z22.s, z6.b, z3.b[0]\n"
@@ -2218,25 +2167,19 @@ void sve_hybrid_s8qs_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "add x20, x20, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z24.s, z6.b, z4.b[0]\n"
"sdot z28.s, z6.b, z5.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
"sdot z29.s, z7.b, z5.b[0]\n"
@@ -2347,14 +2290,8 @@ void sve_hybrid_s8qs_dot_6x4VL (
"sdot z27.s, z7.b, z4.b[3]\n"
"sdot z31.s, z7.b, z5.b[3]\n"
"74:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 69b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -2442,73 +2379,73 @@ void sve_hybrid_s8qs_dot_6x4VL (
".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n"
"tbz %x[flags], #5, 77f\n"
"and z4.d, z8.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z9.d, z1.d\n"
"and z6.d, z10.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
- "and z7.d, z11.d, z3.d\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z8.s, z8.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
- "and z4.d, z12.d, z0.d\n"
"sqadd z9.s, z9.s, z5.s\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z10.s, z10.s, z6.s\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z12.d, z0.d\n"
"and z5.d, z13.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
"sqadd z11.s, z11.s, z7.s\n"
- "and z6.d, z14.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z12.s, z12.s, z4.s\n"
- "and z7.d, z15.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z13.s, z13.s, z5.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "and z7.d, z15.d, z3.d\n"
"and z4.d, z16.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z14.s, z14.s, z6.s\n"
- "and z5.d, z17.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z15.s, z15.s, z7.s\n"
- "and z6.d, z18.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z16.s, z16.s, z4.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "and z6.d, z18.d, z2.d\n"
"and z7.d, z19.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z7.s, z7.s, #0x1f\n"
"sqadd z17.s, z17.s, z5.s\n"
- "and z4.d, z20.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z18.s, z18.s, z6.s\n"
- "and z5.d, z21.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "and z5.d, z21.d, z1.d\n"
"and z6.d, z22.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z20.s, z20.s, z4.s\n"
- "and z7.d, z23.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z21.s, z21.s, z5.s\n"
- "and z4.d, z24.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z22.s, z22.s, z6.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "and z4.d, z24.d, z0.d\n"
"and z5.d, z25.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
"sqadd z23.s, z23.s, z7.s\n"
- "and z6.d, z26.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z24.s, z24.s, z4.s\n"
- "and z7.d, z27.d, z3.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z25.s, z25.s, z5.s\n"
+ "and z6.d, z26.d, z2.d\n"
+ "and z7.d, z27.d, z3.d\n"
"and z4.d, z28.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z4.s, z4.s, #0x1f\n"
"sqadd z26.s, z26.s, z6.s\n"
- "and z5.d, z29.d, z1.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z27.s, z27.s, z7.s\n"
- "and z6.d, z30.d, z2.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z28.s, z28.s, z4.s\n"
+ "and z5.d, z29.d, z1.d\n"
+ "and z6.d, z30.d, z2.d\n"
"and z7.d, z31.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
"asr z7.s, z7.s, #0x1f\n"
"sqadd z29.s, z29.s, z5.s\n"
"sqadd z30.s, z30.s, z6.s\n"
@@ -2665,4 +2602,4 @@ void sve_hybrid_s8qs_dot_6x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..2b7ad8bf4b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_s8qs_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8qs_mmla_6x4VL
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsSVE<lhs_operand_type, rhs_operand_type, result_type, 6, 8, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 49.98 };
+ case CPUModel::A510:
+ return { 22.62 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8qs_mmla_6x4VL;
+ cls_sve_hybrid_s8qs_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..6aba002706
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_mmla_6x4VL/generic.cpp
@@ -0,0 +1,2431 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qs_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs {
+ const int32_t *multiplier_ptr = {};
+ const int32_t *shift_ptr = {};
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 66f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 53f\n"
+ "beq 40f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 27f\n"
+ "beq 14f\n"
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "2:" // Height 1: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x10\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "ble 8f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "cmp x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "bgt 7b\n"
+ "8:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "subs x26, x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "ble 9f\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "9:" // Height 1: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 4b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "mov z15.d, z8.d\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z15.s, z15.s, z0.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "tbz %x[flags], #4, 10f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 11f\n"
+ "10:" // Height 1: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "mov z1.d, z0.d\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "11:" // Height 1: parameters loaded
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ "tbz %x[flags], #5, 12f\n"
+ "and z4.d, z15.d, z0.d\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z4.s\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "12:" // Height 1: no shift correction
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x24, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "uzp1 z15.h, z15.h, z9.h\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "uzp1 z15.b, z15.b, z9.b\n"
+ "st1b { z15.b }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "13:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 80f\n"
+ "14:" // Height 2
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "15:" // Height 2: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x10\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "16:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "17:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 18f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 19f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 19f\n"
+ "18:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "19:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "ble 21f\n"
+ "20:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "cmp x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "add x25, x25, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "bgt 20b\n"
+ "21:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "ble 22f\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "22:" // Height 2: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 17b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
+ "add x23, x9, x19\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "mov z15.d, z7.d\n"
+ "add z15.s, z15.s, z0.s\n"
+ "add z12.s, z12.s, z1.s\n"
+ "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "add z8.s, z8.s, z0.s\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "tbz %x[flags], #4, 23f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 24f\n"
+ "23:" // Height 2: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "mov z1.d, z0.d\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "24:" // Height 2: parameters loaded
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ "tbz %x[flags], #5, 25f\n"
+ "and z4.d, z15.d, z0.d\n"
+ "and z5.d, z12.d, z1.d\n"
+ "and z6.d, z13.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z4.s\n"
+ "sqadd z12.s, z12.s, z5.s\n"
+ "sqadd z13.s, z13.s, z6.s\n"
+ "and z7.d, z14.d, z3.d\n"
+ "and z4.d, z8.d, z0.d\n"
+ "and z5.d, z9.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z7.s\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "and z6.d, z10.d, z2.d\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "25:" // Height 2: no shift correction
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ "add x24, %x[qp], %[minval]\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "add z13.s, z13.s, z4.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "uzp1 z15.h, z15.h, z12.h\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "uzp1 z12.h, z13.h, z14.h\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "uzp1 z15.b, z15.b, z12.b\n"
+ "st1b { z15.b }, p1, [x9]\n"
+ "add z9.s, z9.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x23]\n"
+ "26:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 15b\n"
+ "b 80f\n"
+ "27:" // Height 3
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "28:" // Height 3: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x10\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "32:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "cmp x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ "ble 35f\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ "35:" // Height 3: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 30b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
+ "add x23, x9, x19\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x23, x19\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z7.d\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z12.s, z12.s, z1.s\n"
+ "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "add z8.s, z8.s, z0.s\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "tbz %x[flags], #4, 36f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 37f\n"
+ "36:" // Height 3: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "mov z1.d, z0.d\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "37:" // Height 3: parameters loaded
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ "tbz %x[flags], #5, 38f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z12.d, z1.d\n"
+ "and z6.d, z13.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "sqadd z12.s, z12.s, z5.s\n"
+ "sqadd z13.s, z13.s, z6.s\n"
+ "and z7.d, z14.d, z3.d\n"
+ "and z4.d, z8.d, z0.d\n"
+ "and z5.d, z9.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z7.s\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "and z6.d, z10.d, z2.d\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "and z6.d, z18.d, z2.d\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "38:" // Height 3: no shift correction
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ "add x24, %x[qp], %[minval]\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "add z13.s, z13.s, z4.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "uzp1 z23.h, z23.h, z12.h\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "uzp1 z12.h, z13.h, z14.h\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "uzp1 z23.b, z23.b, z12.b\n"
+ "st1b { z23.b }, p1, [x9]\n"
+ "add z9.s, z9.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x23]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x22]\n"
+ "39:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 28b\n"
+ "b 80f\n"
+ "40:" // Height 4
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "41:" // Height 4: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x10\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "42:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "43:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 44f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 45f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "45:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "ble 47f\n"
+ "46:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "cmp x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "add x22, x22, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ "bgt 46b\n"
+ "47:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ "ble 48f\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ "48:" // Height 4: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 43b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
+ "add x23, x9, x19\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x23, x19\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "add x21, x22, x19\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "addvl x11, x11, #4\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z7.d\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z12.s, z12.s, z1.s\n"
+ "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "add z8.s, z8.s, z0.s\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z15.s, z15.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "tbz %x[flags], #4, 49f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 50f\n"
+ "49:" // Height 4: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "mov z1.d, z0.d\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "50:" // Height 4: parameters loaded
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n"
+ ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n"
+ ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ "tbz %x[flags], #5, 51f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z12.d, z1.d\n"
+ "and z6.d, z13.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "sqadd z12.s, z12.s, z5.s\n"
+ "sqadd z13.s, z13.s, z6.s\n"
+ "and z7.d, z14.d, z3.d\n"
+ "and z4.d, z8.d, z0.d\n"
+ "and z5.d, z9.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z7.s\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "and z6.d, z10.d, z2.d\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z15.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "sqadd z15.s, z15.s, z4.s\n"
+ "and z5.d, z20.d, z1.d\n"
+ "and z6.d, z21.d, z2.d\n"
+ "and z7.d, z22.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "and z5.d, z17.d, z1.d\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "51:" // Height 4: no shift correction
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ "add x24, %x[qp], %[minval]\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "add z13.s, z13.s, z4.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "uzp1 z23.h, z23.h, z12.h\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "uzp1 z12.h, z13.h, z14.h\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "uzp1 z23.b, z23.b, z12.b\n"
+ "st1b { z23.b }, p1, [x9]\n"
+ "add z9.s, z9.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x23]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "uzp1 z15.h, z15.h, z20.h\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "uzp1 z15.b, z15.b, z20.b\n"
+ "st1b { z15.b }, p1, [x22]\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "52:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 41b\n"
+ "b 80f\n"
+ "53:" // Height 5
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "54:" // Height 5: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x10\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "55:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "56:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 57f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 58f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 58f\n"
+ "57:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "58:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "ble 60f\n"
+ "59:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "add x24, x24, #0x10\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ "bgt 59b\n"
+ "60:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "subs x26, x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ "ble 61f\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ "61:" // Height 5: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 56b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
+ "add x23, x9, x19\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x23, x19\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "add x21, x22, x19\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "add x20, x21, x19\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "addvl x11, x11, #4\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z7.d\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z12.s, z12.s, z1.s\n"
+ "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "add z8.s, z8.s, z0.s\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z15.s, z15.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "tbz %x[flags], #4, 62f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 63f\n"
+ "62:" // Height 5: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "mov z1.d, z0.d\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "63:" // Height 5: parameters loaded
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n"
+ ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n"
+ ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
+ ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
+ "tbz %x[flags], #5, 64f\n"
+ "and z4.d, z31.d, z0.d\n"
+ "and z5.d, z12.d, z1.d\n"
+ "and z6.d, z13.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z4.s\n"
+ "sqadd z12.s, z12.s, z5.s\n"
+ "sqadd z13.s, z13.s, z6.s\n"
+ "and z7.d, z14.d, z3.d\n"
+ "and z4.d, z8.d, z0.d\n"
+ "and z5.d, z9.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z7.s\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "and z6.d, z10.d, z2.d\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z15.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "sqadd z15.s, z15.s, z4.s\n"
+ "and z5.d, z20.d, z1.d\n"
+ "and z6.d, z21.d, z2.d\n"
+ "and z7.d, z22.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "and z5.d, z17.d, z1.d\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "and z4.d, z24.d, z0.d\n"
+ "and z5.d, z25.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "sqadd z25.s, z25.s, z5.s\n"
+ "and z6.d, z26.d, z2.d\n"
+ "and z7.d, z27.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "sqadd z27.s, z27.s, z7.s\n"
+ "64:" // Height 5: no shift correction
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ "add x24, %x[qp], %[minval]\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "add z13.s, z13.s, z4.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "uzp1 z31.h, z31.h, z12.h\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "uzp1 z12.h, z13.h, z14.h\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "uzp1 z31.b, z31.b, z12.b\n"
+ "st1b { z31.b }, p1, [x9]\n"
+ "add z9.s, z9.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x23]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "uzp1 z15.h, z15.h, z20.h\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "uzp1 z15.b, z15.b, z20.b\n"
+ "st1b { z15.b }, p1, [x22]\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z26.s, z26.s, z4.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x20]\n"
+ "65:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 54b\n"
+ "b 80f\n"
+ "66:" // Height 6
+ "ldr x13, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x11, %x[col_bias]\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x9, %x[output_ptr]\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x20, #0x6\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "67:" // Height 6: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x10\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "68:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "69:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 70f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 71f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 71f\n"
+ "70:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "71:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "ble 73f\n"
+ "72:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "cmp x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqb { z6.b }, p0/Z, [x20]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "add x20, x20, #0x10\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ "bgt 72b\n"
+ "73:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z6.b }, p0/Z, [x20]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ "ble 74f\n"
+ "ld1b { z7.b }, p2/Z, [x28]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ "74:" // Height 6: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 69b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "ld1w { z0.s }, p2/Z, [x11]\n"
+ "add x23, x9, x19\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "ld1w { z1.s }, p2/Z, [x11, #1, MUL VL]\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "ld1w { z2.s }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x23, x19\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "ld1w { z3.s }, p2/Z, [x11, #3, MUL VL]\n"
+ "add x21, x22, x19\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "add x20, x21, x19\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "add x19, x20, x19\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "addvl x11, x11, #4\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z7.d\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z12.s, z12.s, z1.s\n"
+ "add z13.s, z13.s, z2.s\n"
+ "add z14.s, z14.s, z3.s\n"
+ "add z8.s, z8.s, z0.s\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z15.s, z15.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z28.s, z28.s, z1.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "tbz %x[flags], #4, 75f\n"
+ "ld1w { z0.s }, p2/Z, [x12]\n"
+ "ld1w { z4.s }, p2/Z, [x13]\n"
+ "ld1w { z1.s }, p2/Z, [x12, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x12, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x12, #3, MUL VL]\n"
+ "addvl x12, x12, #4\n"
+ "ld1w { z7.s }, p2/Z, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "b 76f\n"
+ "75:" // Height 6: per layer parameters
+ "add x24, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x24]\n"
+ "mov z1.d, z0.d\n"
+ "add x24, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "76:" // Height 6: parameters loaded
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a5758c // sqrdmulh z12.s, z12.s, z5.s\n"
+ ".inst 0x04a675ad // sqrdmulh z13.s, z13.s, z6.s\n"
+ ".inst 0x04a775ce // sqrdmulh z14.s, z14.s, z7.s\n"
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a475ef // sqrdmulh z15.s, z15.s, z4.s\n"
+ ".inst 0x04a57694 // sqrdmulh z20.s, z20.s, z5.s\n"
+ ".inst 0x04a676b5 // sqrdmulh z21.s, z21.s, z6.s\n"
+ ".inst 0x04a776d6 // sqrdmulh z22.s, z22.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a5779c // sqrdmulh z28.s, z28.s, z5.s\n"
+ ".inst 0x04a677bd // sqrdmulh z29.s, z29.s, z6.s\n"
+ ".inst 0x04a777de // sqrdmulh z30.s, z30.s, z7.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
+ ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
+ "tbz %x[flags], #5, 77f\n"
+ "and z4.d, z31.d, z0.d\n"
+ "and z5.d, z12.d, z1.d\n"
+ "and z6.d, z13.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z4.s\n"
+ "sqadd z12.s, z12.s, z5.s\n"
+ "sqadd z13.s, z13.s, z6.s\n"
+ "and z7.d, z14.d, z3.d\n"
+ "and z4.d, z8.d, z0.d\n"
+ "and z5.d, z9.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z7.s\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "and z6.d, z10.d, z2.d\n"
+ "and z7.d, z11.d, z3.d\n"
+ "and z4.d, z15.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "sqadd z15.s, z15.s, z4.s\n"
+ "and z5.d, z20.d, z1.d\n"
+ "and z6.d, z21.d, z2.d\n"
+ "and z7.d, z22.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "and z5.d, z17.d, z1.d\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z28.d, z1.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "sqadd z28.s, z28.s, z5.s\n"
+ "and z6.d, z29.d, z2.d\n"
+ "and z7.d, z30.d, z3.d\n"
+ "and z4.d, z24.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z29.s, z29.s, z6.s\n"
+ "sqadd z30.s, z30.s, z7.s\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "and z5.d, z25.d, z1.d\n"
+ "and z6.d, z26.d, z2.d\n"
+ "and z7.d, z27.d, z3.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z5.s\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "sqadd z27.s, z27.s, z7.s\n"
+ "77:" // Height 6: no shift correction
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add x24, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x24]\n"
+ ".inst 0x4482882c // srshl z12.s, p2/M, z12.s, z1.s\n"
+ "add x24, %x[qp], %[minval]\n"
+ ".inst 0x4482884d // srshl z13.s, p2/M, z13.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x24]\n"
+ "add x24, %x[qp], %[maxval]\n"
+ ".inst 0x4482886e // srshl z14.s, p2/M, z14.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x24]\n"
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "add z13.s, z13.s, z4.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "uzp1 z31.h, z31.h, z12.h\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "uzp1 z12.h, z13.h, z14.h\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "uzp1 z31.b, z31.b, z12.b\n"
+ "st1b { z31.b }, p1, [x9]\n"
+ "add z9.s, z9.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ ".inst 0x4482880f // srshl z15.s, p2/M, z15.s, z0.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ ".inst 0x44828834 // srshl z20.s, p2/M, z20.s, z1.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828855 // srshl z21.s, p2/M, z21.s, z2.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x23]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "uzp1 z15.h, z15.h, z20.h\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "uzp1 z15.b, z15.b, z20.b\n"
+ "st1b { z15.b }, p1, [x22]\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x4482883c // srshl z28.s, p2/M, z28.s, z1.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ ".inst 0x4482885d // srshl z29.s, p2/M, z29.s, z2.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ ".inst 0x4482887e // srshl z30.s, p2/M, z30.s, z3.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "add z30.s, z30.s, z4.s\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
+ "uzp1 z28.h, z29.h, z30.h\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "uzp1 z23.b, z23.b, z28.b\n"
+ "st1b { z23.b }, p1, [x20]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x19]\n"
+ "78:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 67b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 80f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 79f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "79:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "80:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
index 37258978d3..b8ca7c5456 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -22,9 +22,10 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
class cls_sve_hybrid_s8s32_dot_6x4VL
{
public:
- typedef int8_t operand_type;
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
typedef int32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,36 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.56 };
+ case CPUModel::A510:
+ return { 20.92 };
+ case CPUModel::V1:
+ return { 62.24 };
+ }
+ }
+
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.60, 15.53, 0.62 };
+ case CPUModel::A510:
+ return { 22.77, 3.90, 0.47 };
+ case CPUModel::V1:
+ return { 62.97, 19.14, 0.92 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
@@ -80,4 +111,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..e0fea96ef3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/a64fx.cpp
@@ -0,0 +1,1033 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
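+    // flags bit layout (tested by tbz in the assembly below): bit 0 = accumulate
+    // into the existing output, bit 2 = indirect output, bit 3 = indirect input.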
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 51f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 41f\n"
+ "beq 31f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 21f\n"
+ "beq 11f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "add x25, x25, #0x4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "addvl x9, x9, #4\n"
+ "bne 5b\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "10:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 62f\n"
+ "11:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "12:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 13f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "b 14f\n"
+ "13:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "14:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "15:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "17:" // Height 2: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 19f\n"
+ "18:" // Height 2: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "subs x26, x26, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 18b\n"
+ "19:" // Height 2: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "addvl x9, x9, #4\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "bne 15b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "20:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 12b\n"
+ "b 62f\n"
+ "21:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "22:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "b 24f\n"
+ "23:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "24:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "25:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 26f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 27f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 27f\n"
+ "26:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "27:" // Height 3: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 29f\n"
+ "28:" // Height 3: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x4\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x4\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "add x23, x23, #0x4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 28b\n"
+ "29:" // Height 3: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x27, x27, #0x1\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "cmp x27, x19\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "bne 25b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "30:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 22b\n"
+ "b 62f\n"
+ "31:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "32:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 33f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "34:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "35:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 37f\n"
+ "36:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "37:" // Height 4: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 39f\n"
+ "38:" // Height 4: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x4\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x4\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "add x23, x23, #0x4\n"
+ "add x22, x22, #0x4\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z22.s, z6.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "sdot z23.s, z7.b, z3.b\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 38b\n"
+ "39:" // Height 4: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x27, x27, #0x1\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "cmp x27, x19\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z22.s, z6.b, z3.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "sdot z23.s, z7.b, z3.b\n"
+ "bne 35b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "40:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 32b\n"
+ "b 62f\n"
+ "41:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "42:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 44f\n"
+ "43:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "44:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "45:" // Height 5: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 47f\n"
+ "46:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "47:" // Height 5: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 49f\n"
+ "48:" // Height 5: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x4\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "sdot z24.s, z6.b, z4.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x22, x22, #0x4\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "add x21, x21, #0x4\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "sdot z25.s, z7.b, z4.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z22.s, z6.b, z3.b\n"
+ "sdot z26.s, z6.b, z4.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "sdot z23.s, z7.b, z3.b\n"
+ "sdot z27.s, z7.b, z4.b\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 48b\n"
+ "49:" // Height 5: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x27, x27, #0x1\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "cmp x27, x19\n"
+ "sdot z24.s, z6.b, z4.b\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "sdot z25.s, z7.b, z4.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z22.s, z6.b, z3.b\n"
+ "sdot z26.s, z6.b, z4.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "sdot z23.s, z7.b, z3.b\n"
+ "sdot z27.s, z7.b, z4.b\n"
+ "bne 45b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x20]\n"
+ "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
+ "50:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 42b\n"
+ "b 62f\n"
+ "51:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "52:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 53f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x19]\n"
+ "ld1w { z29.s }, p2/Z, [x19, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x19, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x19, #3, MUL VL]\n"
+ "b 54f\n"
+ "53:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "54:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "55:" // Height 6: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 56f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 57f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 57f\n"
+ "56:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "57:" // Height 6: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 59f\n"
+ "58:" // Height 6: Multiply loop: Main loop
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x4\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "sdot z24.s, z6.b, z4.b\n"
+ "sdot z28.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x22, x22, #0x4\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "add x21, x21, #0x4\n"
+ "add x20, x20, #0x4\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "sdot z25.s, z7.b, z4.b\n"
+ "sdot z29.s, z7.b, z5.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z22.s, z6.b, z3.b\n"
+ "sdot z26.s, z6.b, z4.b\n"
+ "sdot z30.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "sdot z23.s, z7.b, z3.b\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "sdot z27.s, z7.b, z4.b\n"
+ "sdot z31.s, z7.b, z5.b\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 58b\n"
+ "59:" // Height 6: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "sdot z8.s, z6.b, z0.b\n"
+ "sdot z12.s, z6.b, z1.b\n"
+ "add x27, x27, #0x1\n"
+ "sdot z16.s, z6.b, z2.b\n"
+ "sdot z20.s, z6.b, z3.b\n"
+ "cmp x27, x19\n"
+ "sdot z24.s, z6.b, z4.b\n"
+ "sdot z28.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b\n"
+ "sdot z13.s, z7.b, z1.b\n"
+ "sdot z17.s, z7.b, z2.b\n"
+ "sdot z21.s, z7.b, z3.b\n"
+ "sdot z25.s, z7.b, z4.b\n"
+ "sdot z29.s, z7.b, z5.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "sdot z10.s, z6.b, z0.b\n"
+ "sdot z14.s, z6.b, z1.b\n"
+ "sdot z18.s, z6.b, z2.b\n"
+ "sdot z22.s, z6.b, z3.b\n"
+ "sdot z26.s, z6.b, z4.b\n"
+ "sdot z30.s, z6.b, z5.b\n"
+ "sdot z11.s, z7.b, z0.b\n"
+ "sdot z15.s, z7.b, z1.b\n"
+ "sdot z19.s, z7.b, z2.b\n"
+ "sdot z23.s, z7.b, z3.b\n"
+ "sdot z27.s, z7.b, z4.b\n"
+ "sdot z31.s, z7.b, z5.b\n"
+ "bne 55b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x20]\n"
+ "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x19]\n"
+ "st1w { z29.s }, p2, [x19, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x19, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x19, #3, MUL VL]\n"
+ "60:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 52b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "61:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "62:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
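Note: the wrapper above marshals every parameter into the local KernelArgs structure so the assembly can address each field through a single base register plus constant offsets. A minimal sketch of that pattern follows; Args and load_n are names local to this note, not part of the patch, and the snippet assumes an AArch64 GNU-syntax toolchain:

#include <cstddef>
#include <cstdint>

struct Args {
    size_t N = 0;
    const int8_t *B_ptr = nullptr;
};

// Reads Args::N from inline assembly using the same base-plus-offsetof
// addressing the kernels above rely on: the "I" constraint passes the
// field offset as an immediate into the ldr addressing mode.
static size_t load_n(const Args &args) {
    size_t n;
    __asm__ __volatile__(
        "ldr %x[n], [%x[args_ptr], %[offsetof_N]]\n"
        : [n] "=r" (n)
        : [args_ptr] "r" (&args),
          [offsetof_N] "I" (offsetof(Args, N))
        : "memory");
    return n;
}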
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
index 9cddee941e..dc5b7a33f4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -137,13 +137,12 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "cmp x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"sdot z11.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"sdot z8.s, z6.b, z0.b[1]\n"
@@ -178,7 +177,6 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
@@ -217,9 +215,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z10.s, z6.b, z0.b[3]\n"
"sdot z11.s, z7.b, z0.b[3]\n"
"10:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 5b\n"
"st1w { z8.s }, p4, [x28]\n"
@@ -296,16 +293,14 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
@@ -356,9 +351,7 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
- "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"sdot z13.s, z7.b, z1.b[0]\n"
@@ -413,10 +406,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z11.s, z7.b, z0.b[3]\n"
"sdot z15.s, z7.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 16b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -511,21 +502,18 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
"add x23, x23, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"sdot z17.s, z7.b, z2.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -590,12 +578,9 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
- "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"sdot z16.s, z6.b, z2.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
@@ -665,11 +650,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z15.s, z7.b, z1.b[3]\n"
"sdot z19.s, z7.b, z2.b[3]\n"
"32:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 27b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -781,26 +763,22 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
@@ -879,19 +857,15 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "add x22, x22, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
@@ -972,12 +946,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z19.s, z7.b, z2.b[3]\n"
"sdot z23.s, z7.b, z3.b[3]\n"
"43:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 38b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1106,32 +1076,27 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"add x21, x21, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
"sdot z24.s, z6.b, z4.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z22.s, z6.b, z3.b[0]\n"
@@ -1223,22 +1188,17 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
- "add x21, x21, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
"sdot z24.s, z6.b, z4.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
@@ -1334,13 +1294,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z23.s, z7.b, z3.b[3]\n"
"sdot z27.s, z7.b, z4.b[3]\n"
"54:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 49b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1489,37 +1444,31 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"sdot z16.s, z6.b, z2.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"add x20, x20, #0x10\n"
"sdot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
"sdot z28.s, z6.b, z5.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"sdot z29.s, z7.b, z5.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"sdot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"sdot z14.s, z6.b, z1.b[0]\n"
"sdot z18.s, z6.b, z2.b[0]\n"
"sdot z22.s, z6.b, z3.b[0]\n"
@@ -1625,25 +1574,19 @@ void sve_hybrid_s8s32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"sdot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"sdot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"sdot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
"sdot z20.s, z6.b, z3.b[0]\n"
- "add x20, x20, #0x10\n"
- "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z24.s, z6.b, z4.b[0]\n"
"sdot z28.s, z6.b, z5.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
"sdot z21.s, z7.b, z3.b[0]\n"
"sdot z25.s, z7.b, z4.b[0]\n"
"sdot z29.s, z7.b, z5.b[0]\n"
@@ -1754,14 +1697,8 @@ void sve_hybrid_s8s32_dot_6x4VL (
"sdot z27.s, z7.b, z4.b[3]\n"
"sdot z31.s, z7.b, z5.b[3]\n"
"65:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 60b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..b88ef14f25
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL.hpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int32_t>, \
+ const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_s8s32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8s32_mmla_6x4VL
+{
+public:
+ typedef int8_t lhs_operand_type;
+ typedef int8_t rhs_operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 8> transforms = {};
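+ // The per-CPU-model figures returned below are performance estimates used
+ // by arm_gemm's kernel-selection heuristics; their units are not stated in
+ // this file.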
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.42 };
+ case CPUModel::A510:
+ return { 24.21 };
+ case CPUModel::V1:
+ return { 104.92 };
+ }
+ }
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.99, 15.37, 0.62 };
+ case CPUModel::A510:
+ return { 23.87, 3.89, 0.37 };
+ case CPUModel::V1:
+ return { 107.63, 19.24, 0.92 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8s32_mmla_6x4VL;
+ cls_sve_hybrid_s8s32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
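Note: the blocking parameters declared above fix the output tile processed per iteration at 6 rows by four SVE vectors of int32, with K consumed 8 elements at a time. A small arithmetic sketch, assuming a 256-bit vector length (the real value is queried at run time via get_vector_length<int32_t>()):

#include <cstdio>

int main() {
    const unsigned vl_bits    = 256;            // assumed SVE vector length
    const unsigned s32_lanes  = vl_bits / 32;   // int32 lanes per vector: 8
    const unsigned out_width  = s32_lanes * 4;  // out_width(): 4 vectors -> 32 columns
    const unsigned out_height = 6;              // out_height()
    const unsigned k_unroll   = 8;              // k_unroll(): K elements per unroll step
    std::printf("tile: %ux%u, k unroll %u\n", out_height, out_width, k_unroll);
    return 0;
}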
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..c3abb203ca
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,1675 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
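+ // The SMMLA ops below produce 2x2 output tiles: pairs of input rows are
+ // interleaved with trn1/trn2 before the multiply loop, previously stored
+ // output is re-interleaved with zip1/zip2 when accumulating, and the
+ // accumulators are de-interleaved with uzp1/uzp2 at writeback.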
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 56f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 45f\n"
+ "beq 34f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 23f\n"
+ "beq 12f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "cmp x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "subs x26, x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "ble 10f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 5b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "st1w { z8.s }, p4, [x28]\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "11:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 68f\n"
+ "12:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "13:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 14f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 15f\n"
+ "14:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "15:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "16:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 18f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "18:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "ble 20f\n"
+ "19:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "cmp x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "add x25, x25, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "bgt 19b\n"
+ "20:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ "ble 21f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ "21:" // Height 2: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 16b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "22:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 13b\n"
+ "b 68f\n"
+ "23:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "24:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 25f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 26f\n"
+ "25:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "26:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "27:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 28f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 29f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 29f\n"
+ "28:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "29:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "ble 31f\n"
+ "30:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "cmp x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ "bgt 30b\n"
+ "31:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ "ble 32f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ "32:" // Height 3: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 27b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "addvl x28, x28, #4\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "33:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 24b\n"
+ "b 68f\n"
+ "34:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "35:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 36f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 37f\n"
+ "36:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "37:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "38:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 39f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 40f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 40f\n"
+ "39:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "40:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "ble 42f\n"
+ "41:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "cmp x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "add x22, x22, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ "bgt 41b\n"
+ "42:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ "ble 43f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ "43:" // Height 4: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 38b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "addvl x28, x28, #4\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "st1w { z15.s }, p4, [x22]\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "44:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 35b\n"
+ "b 68f\n"
+ "45:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "46:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z6.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z6.d, z31.d\n"
+ "zip2 z31.d, z6.d, z31.d\n"
+ "b 48f\n"
+ "47:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "48:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "49:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "51:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "ble 53f\n"
+ "52:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "add x24, x24, #0x10\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ "bgt 52b\n"
+ "53:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "subs x26, x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ "ble 54f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ "54:" // Height 5: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 49b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "addvl x28, x28, #4\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "st1w { z15.s }, p4, [x22]\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "st1w { z22.s }, p1, [x22, #3, MUL VL]\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "55:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 46b\n"
+ "b 68f\n"
+ "56:" // Height 6
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "57:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z6.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z28.s }, p4/Z, [x19]\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z6.d, z31.d\n"
+ "zip2 z31.d, z6.d, z31.d\n"
+ "b 59f\n"
+ "58:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "59:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "60:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 62f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 62f\n"
+ "61:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "62:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "ble 64f\n"
+ "63:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "cmp x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqb { z6.b }, p0/Z, [x20]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ "add x20, x20, #0x10\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ "bgt 63b\n"
+ "64:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ ".inst 0x45079808 // smmla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z6.b }, p0/Z, [x20]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45079850 // smmla z16.s, z2.b, z7.b\n"
+ ".inst 0x45079898 // smmla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x4506989c // smmla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
+ ".inst 0x45079851 // smmla z17.s, z2.b, z7.b\n"
+ ".inst 0x45079899 // smmla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506980d // smmla z13.s, z0.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506989d // smmla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507980a // smmla z10.s, z0.b, z7.b\n"
+ ".inst 0x45079852 // smmla z18.s, z2.b, z7.b\n"
+ ".inst 0x4507989a // smmla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506980e // smmla z14.s, z0.b, z6.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x4506989e // smmla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079853 // smmla z19.s, z2.b, z7.b\n"
+ ".inst 0x4507989b // smmla z27.s, z4.b, z7.b\n"
+ ".inst 0x4506980f // smmla z15.s, z0.b, z6.b\n"
+ ".inst 0x45069857 // smmla z23.s, z2.b, z6.b\n"
+ ".inst 0x4506989f // smmla z31.s, z4.b, z6.b\n"
+ "ble 65f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45079828 // smmla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45079870 // smmla z16.s, z3.b, z7.b\n"
+ ".inst 0x450798b8 // smmla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x4506982c // smmla z12.s, z1.b, z6.b\n"
+ ".inst 0x45069874 // smmla z20.s, z3.b, z6.b\n"
+ ".inst 0x450698bc // smmla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45079829 // smmla z9.s, z1.b, z7.b\n"
+ ".inst 0x45079871 // smmla z17.s, z3.b, z7.b\n"
+ ".inst 0x450798b9 // smmla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x4506982d // smmla z13.s, z1.b, z6.b\n"
+ ".inst 0x45069875 // smmla z21.s, z3.b, z6.b\n"
+ ".inst 0x450698bd // smmla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x4507982a // smmla z10.s, z1.b, z7.b\n"
+ ".inst 0x45079872 // smmla z18.s, z3.b, z7.b\n"
+ ".inst 0x450798ba // smmla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ ".inst 0x45069876 // smmla z22.s, z3.b, z6.b\n"
+ ".inst 0x450698be // smmla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x4507982b // smmla z11.s, z1.b, z7.b\n"
+ ".inst 0x45079873 // smmla z19.s, z3.b, z7.b\n"
+ ".inst 0x450798bb // smmla z27.s, z5.b, z7.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069877 // smmla z23.s, z3.b, z6.b\n"
+ ".inst 0x450698bf // smmla z31.s, z5.b, z6.b\n"
+ "65:" // Height 6: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 60b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "st1w { z15.s }, p4, [x22]\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "st1w { z22.s }, p1, [x22, #3, MUL VL]\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "st1w { z23.s }, p4, [x20]\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "st1w { z28.s }, p3, [x20, #1, MUL VL]\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "st1w { z29.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x19]\n"
+ "st1w { z25.s }, p3, [x19, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x19, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x19, #3, MUL VL]\n"
+ "66:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 57b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 68f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "67:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "68:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index 3de8d178cd..c66ebedc4d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -22,9 +22,10 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_u8qa_dot_4x4VL( ARGLIST );
class cls_sve_hybrid_u8qa_dot_4x4VL
{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
typedef uint8_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,22 @@ public:
return false;
}
- StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 29.89 };
+ case CPUModel::A510:
+ return { 17.12 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_u8qa_dot_4x4VL;
@@ -80,4 +97,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
index 0bfc28776f..be6d5b901d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -158,7 +158,6 @@ void sve_hybrid_u8qa_dot_4x4VL (
"tbnz %x[flags], #31, 8f\n"
"udot z11.s, z0.b, z15.b\n"
"8:" // Height 1: Multiply loop: unique 1: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
"sub x24, x24, #0x10\n"
"cmp x24, #0x10\n"
"bgt 7b\n"
@@ -170,7 +169,6 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x23]\n"
"udot z16.s, z4.b, z0.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x23, x23, #0x10\n"
"udot z17.s, z5.b, z0.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"addvl x28, x28, #4\n"
@@ -212,9 +210,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"tbnz %x[flags], #31, 11f\n"
"udot z11.s, z0.b, z15.b\n"
"11:" // Height 1: Multiply loop: unique 2: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
"cmp x25, x19\n"
"bne 4b\n"
"tbnz %x[flags], #31, 12f\n"
@@ -251,16 +248,16 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
"tbz %x[flags], #5, 13f\n"
"and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
"and z6.d, z18.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"and z7.d, z19.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"asr z6.s, z6.s, #0x1f\n"
"sqadd z16.s, z16.s, z4.s\n"
- "asr z7.s, z7.s, #0x1f\n"
"sqadd z17.s, z17.s, z5.s\n"
"sqadd z18.s, z18.s, z6.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
"13:" // Height 1: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
@@ -396,9 +393,7 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"22:" // Height 2: Multiply loop: unique 3: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
"sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"cmp x24, #0x10\n"
"bgt 21b\n"
"23:" // Height 2: Multiply loop: Single iteration only
@@ -409,12 +404,10 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x23]\n"
"udot z16.s, z4.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
"udot z17.s, z5.b, z0.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
"addvl x28, x28, #4\n"
"udot z21.s, z5.b, z1.b[0]\n"
"udot z18.s, z6.b, z0.b[0]\n"
@@ -470,10 +463,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z11.s, z0.b, z15.b\n"
"udot z12.s, z1.b, z15.b\n"
"25:" // Height 2: Multiply loop: unique 4: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
"cmp x25, x19\n"
"bne 18b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -527,27 +518,27 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
"tbz %x[flags], #5, 27f\n"
"and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
"and z6.d, z18.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
"and z7.d, z19.d, z0.d\n"
"and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"and z9.d, z21.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
"asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
"asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
"sqadd z20.s, z20.s, z8.s\n"
"sqadd z21.s, z21.s, z9.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
"sqadd z22.s, z22.s, z10.s\n"
"sqadd z23.s, z23.s, z4.s\n"
"27:" // Height 2: no shift correction
@@ -731,11 +722,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
"36:" // Height 3: Multiply loop: unique 5: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
"sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"cmp x24, #0x10\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"bgt 35b\n"
"37:" // Height 3: Multiply loop: Single iteration only
"ld1b { z4.b }, p2/Z, [x28]\n"
@@ -745,16 +733,13 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x23]\n"
"udot z16.s, z4.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
"udot z17.s, z5.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x21, x21, #0x10\n"
- "udot z24.s, z4.b, z2.b[0]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"addvl x28, x28, #4\n"
+ "udot z24.s, z4.b, z2.b[0]\n"
"udot z21.s, z5.b, z1.b[0]\n"
"udot z25.s, z5.b, z2.b[0]\n"
"udot z18.s, z6.b, z0.b[0]\n"
@@ -825,11 +810,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z12.s, z1.b, z15.b\n"
"udot z13.s, z2.b, z15.b\n"
"39:" // Height 3: Multiply loop: unique 6: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
"cmp x25, x19\n"
"bne 32b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -899,39 +881,39 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
"tbz %x[flags], #5, 41f\n"
"and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
"and z6.d, z18.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
"and z7.d, z19.d, z0.d\n"
"and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"and z9.d, z21.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
"asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
"asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
"sqadd z20.s, z20.s, z8.s\n"
"sqadd z21.s, z21.s, z9.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"sqadd z22.s, z22.s, z10.s\n"
"sqadd z23.s, z23.s, z4.s\n"
- "and z6.d, z25.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z24.s, z24.s, z5.s\n"
+ "and z6.d, z25.d, z0.d\n"
"and z7.d, z26.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"and z8.d, z27.d, z0.d\n"
- "sqadd z25.s, z25.s, z6.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z8.s, z8.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z6.s\n"
"sqadd z26.s, z26.s, z7.s\n"
"sqadd z27.s, z27.s, z8.s\n"
"41:" // Height 3: no shift correction
@@ -1165,12 +1147,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z13.s, z2.b, z15.b\n"
"udot z14.s, z3.b, z15.b\n"
"50:" // Height 4: Multiply loop: unique 7: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
"sub x24, x24, #0x10\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"cmp x24, #0x10\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"bgt 49b\n"
"51:" // Height 4: Multiply loop: Single iteration only
"ld1b { z4.b }, p2/Z, [x28]\n"
@@ -1180,19 +1158,15 @@ void sve_hybrid_u8qa_dot_4x4VL (
"ld1rqb { z0.b }, p0/Z, [x23]\n"
"udot z16.s, z4.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
"udot z17.s, z5.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
- "udot z20.s, z4.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
- "udot z24.s, z4.b, z2.b[0]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
"ld1b { z6.b }, p2/Z, [x28, #2, MUL VL]\n"
- "add x20, x20, #0x10\n"
"udot z21.s, z5.b, z1.b[0]\n"
"ld1b { z7.b }, p2/Z, [x28, #3, MUL VL]\n"
"addvl x28, x28, #4\n"
+ "udot z24.s, z4.b, z2.b[0]\n"
"udot z28.s, z4.b, z3.b[0]\n"
"udot z25.s, z5.b, z2.b[0]\n"
"udot z29.s, z5.b, z3.b[0]\n"
@@ -1279,12 +1253,8 @@ void sve_hybrid_u8qa_dot_4x4VL (
"udot z13.s, z2.b, z15.b\n"
"udot z14.s, z3.b, z15.b\n"
"53:" // Height 4: Multiply loop: unique 8: skip row sum
- "prfm pldl1keep, [x23, #0x80]\n"
- "add x25, x25, #0x1\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
"cmp x25, x19\n"
"bne 46b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1370,52 +1340,52 @@ void sve_hybrid_u8qa_dot_4x4VL (
".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
"tbz %x[flags], #5, 55f\n"
"and z4.d, z16.d, z0.d\n"
- "asr z4.s, z4.s, #0x1f\n"
"and z5.d, z17.d, z0.d\n"
"and z6.d, z18.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
"asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
"and z7.d, z19.d, z0.d\n"
"and z8.d, z20.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"and z9.d, z21.d, z0.d\n"
"asr z7.s, z7.s, #0x1f\n"
- "sqadd z16.s, z16.s, z4.s\n"
- "and z10.d, z22.d, z0.d\n"
"asr z8.s, z8.s, #0x1f\n"
- "and z4.d, z23.d, z0.d\n"
"asr z9.s, z9.s, #0x1f\n"
- "sqadd z17.s, z17.s, z5.s\n"
- "asr z10.s, z10.s, #0x1f\n"
- "sqadd z18.s, z18.s, z6.s\n"
- "asr z4.s, z4.s, #0x1f\n"
- "and z5.d, z24.d, z0.d\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z19.s, z19.s, z7.s\n"
"sqadd z20.s, z20.s, z8.s\n"
"sqadd z21.s, z21.s, z9.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
"sqadd z22.s, z22.s, z10.s\n"
"sqadd z23.s, z23.s, z4.s\n"
- "and z6.d, z25.d, z0.d\n"
- "asr z6.s, z6.s, #0x1f\n"
"sqadd z24.s, z24.s, z5.s\n"
+ "and z6.d, z25.d, z0.d\n"
"and z7.d, z26.d, z0.d\n"
- "asr z7.s, z7.s, #0x1f\n"
"and z8.d, z27.d, z0.d\n"
- "and z9.d, z28.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
"asr z8.s, z8.s, #0x1f\n"
"sqadd z25.s, z25.s, z6.s\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "and z9.d, z28.d, z0.d\n"
"and z10.d, z29.d, z0.d\n"
- "asr z9.s, z9.s, #0x1f\n"
"and z4.d, z30.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
"asr z10.s, z10.s, #0x1f\n"
- "sqadd z26.s, z26.s, z7.s\n"
- "and z5.d, z31.d, z0.d\n"
"asr z4.s, z4.s, #0x1f\n"
- "sqadd z27.s, z27.s, z8.s\n"
- "asr z5.s, z5.s, #0x1f\n"
"sqadd z28.s, z28.s, z9.s\n"
"sqadd z29.s, z29.s, z10.s\n"
"sqadd z30.s, z30.s, z4.s\n"
+ "and z5.d, z31.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
"sqadd z31.s, z31.s, z5.s\n"
"55:" // Height 4: no shift correction
".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
@@ -1529,4 +1499,4 @@ void sve_hybrid_u8qa_dot_4x4VL (
}
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
new file mode 100644
index 0000000000..da27554a0f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
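+
+// ARGLIST mirrors the kernel definition in generic.cpp:
+// (num_strings, string_lengths, A_arg, M, N, B_ptr, output_arg, qp, col_bias, <unused>)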
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_u8qa_mmla_4x4VL( ARGLIST );
+
+class cls_sve_hybrid_u8qa_mmla_4x4VL
+{
+public:
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
+ typedef uint8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 4;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 4, 8, 8> transforms = {};
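+
+ // The figures below appear to be per-core throughput estimates consumed by the
+ // kernel-selection heuristics; the exact unit is an implementation detail.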
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 47.30 };
+ case CPUModel::A510:
+ return { 20.91 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_u8qa_mmla_4x4VL;
+ cls_sve_hybrid_u8qa_mmla_4x4VL(const CPUInfo *)
+ {
+ }
+};
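+
+// Hypothetical usage sketch (not part of the patch): callers are expected to
+// instantiate the class and invoke the function pointer it exposes, e.g.
+//   cls_sve_hybrid_u8qa_mmla_4x4VL k(ci);
+//   k.kernel(num_strings, string_lengths, A_arg, M, N, B_ptr, out_arg, qp, col_bias, 0);
+// Tiles are out_height()==4 rows by out_width()==4 SVE vectors of int32, with K
+// consumed in multiples of k_unroll()==8.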
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
new file mode 100644
index 0000000000..0f3f5e35e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_mmla_4x4VL/generic.cpp
@@ -0,0 +1,1418 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8qa_mmla_4x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
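+
+ // ka packs the fields the assembly reads back through args_ptr using the
+ // offsetof_* operands bound at the end of this function.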
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
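+
+ // Flag bits consumed below: 0x4 marks indirect output, 0x8 (bit 3) selects the
+ // indirect-input path, 0x20 (bit 5) enables the shift-correction step after
+ // sqrdmulh, and bit 31 gates row-sum accumulation.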
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "mov z11.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov z15.b, #0x1\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x27, %x[col_bias]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov x26, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x9\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "3:" // Height 1: setup done
+ "mov x25, #0x0\n"
+ "4:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 5f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "cbnz x25, 6f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "b 6f\n"
+ "5:" // Height 1: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "6:" // Height 1: input setup done
+ "cmp x24, #0x10\n"
+ "ble 9f\n"
+ "7:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
+ ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ "tbnz %x[flags], #31, 8f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "8:" // Height 1: Multiply loop: unique 1: skip row sum
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 7b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "subs x24, x24, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
+ ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 11f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 4b\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z16.d\n"
+ "tbnz %x[flags], #31, 12f\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1rw { z1.s }, p2/Z, [x22]\n"
+ "mov z11.s, z11.s[0]\n"
+ "neg z1.s, p2/M, z1.s\n"
+ "mul z11.s, p2/M, z11.s, z1.s\n"
+ "12:" // Height 1: skip row sum fixup
+ "add z23.s, z23.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "add z23.s, z23.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
+ "add z17.s, z17.s, z1.s\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "tbz %x[flags], #5, 13f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "and z7.d, z19.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "13:" // Height 1: no shift correction
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x22, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x22]\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z23.h, z23.h, z17.h\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z23.b, z23.b, z17.b\n"
+ "st1b { z23.b }, p1, [x26]\n"
+ "addvl x26, x26, #1\n"
+ "14:" // Height 1: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 2b\n"
+ "b 58f\n"
+ "15:" // Height 2
+ "mov z11.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z15.b, #0x1\n"
+ "mov x26, %x[output_ptr]\n"
+ "16:" // Height 2: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x9\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x25, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "cbnz x25, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "20:" // Height 2: input setup done
+ "cmp x24, #0x10\n"
+ "ble 23f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
+ ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ "tbnz %x[flags], #31, 22f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "22:" // Height 2: Multiply loop: unique 3: skip row sum
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 21b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
+ ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ "ble 24f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 25f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "25:" // Height 2: Multiply loop: unique 4: skip row sum
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 18b\n"
+ "uzp1 z7.d, z16.d, z20.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x21, x26, x19\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "mov z23.d, z7.d\n"
+ "tbnz %x[flags], #31, 26f\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x22]\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "neg z2.s, p2/M, z2.s\n"
+ "mul z11.s, p2/M, z11.s, z2.s\n"
+ "mul z12.s, p2/M, z12.s, z2.s\n"
+ "26:" // Height 2: skip row sum fixup
+ "add z23.s, z23.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "add z16.s, z16.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "tbz %x[flags], #5, 27f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "and z7.d, z22.d, z0.d\n"
+ "and z8.d, z16.d, z0.d\n"
+ "and z9.d, z17.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "sqadd z16.s, z16.s, z8.s\n"
+ "sqadd z17.s, z17.s, z9.s\n"
+ "and z10.d, z18.d, z0.d\n"
+ "and z4.d, z19.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z10.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "27:" // Height 2: no shift correction
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add x22, %x[qp], %[minval]\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x22]\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z23.h, z23.h, z20.h\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "uzp1 z23.b, z23.b, z20.b\n"
+ "st1b { z23.b }, p1, [x26]\n"
+ "add z17.s, z17.s, z4.s\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "28:" // Height 2: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 16b\n"
+ "b 58f\n"
+ "29:" // Height 3
+ "mov z11.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "mov x26, %x[output_ptr]\n"
+ "mov z15.b, #0x1\n"
+ "30:" // Height 3: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x9\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "31:" // Height 3: setup done
+ "mov x25, #0x0\n"
+ "32:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "cbnz x25, 34f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 34f\n"
+ "33:" // Height 3: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "34:" // Height 3: input setup done
+ "cmp x24, #0x10\n"
+ "ble 37f\n"
+ "35:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
+ ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
+ ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
+ ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n"
+ ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n"
+ ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
+ ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n"
+ ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
+ ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n"
+ ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
+ ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
+ ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n"
+ "tbnz %x[flags], #31, 36f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "udot z13.s, z3.b, z15.b\n"
+ "36:" // Height 3: Multiply loop: unique 5: skip row sum
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 35b\n"
+ "37:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "subs x24, x24, #0x8\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n"
+ ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
+ ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n"
+ ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
+ ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
+ ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
+ ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
+ "ble 38f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
+ ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n"
+ ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
+ ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n"
+ ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
+ ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
+ ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n"
+ "38:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 39f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "udot z13.s, z3.b, z15.b\n"
+ "39:" // Height 3: Multiply loop: unique 6: skip row sum
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 32b\n"
+ "uzp1 z7.d, z16.d, z20.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x21, x26, x19\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "add x20, x21, x19\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z7.d\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1rw { z3.s }, p2/Z, [x22]\n"
+ ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "neg z3.s, p2/M, z3.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z3.s\n"
+ "mul z12.s, p2/M, z12.s, z3.s\n"
+ "mul z13.s, p2/M, z13.s, z3.s\n"
+ "40:" // Height 3: skip row sum fixup
+ "add z31.s, z31.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "add z16.s, z16.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "tbz %x[flags], #5, 41f\n"
+ "and z4.d, z31.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z4.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "and z7.d, z22.d, z0.d\n"
+ "and z8.d, z16.d, z0.d\n"
+ "and z9.d, z17.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "sqadd z16.s, z16.s, z8.s\n"
+ "sqadd z17.s, z17.s, z9.s\n"
+ "and z10.d, z18.d, z0.d\n"
+ "and z4.d, z19.d, z0.d\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z10.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "and z7.d, z26.d, z0.d\n"
+ "and z8.d, z27.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "41:" // Height 3: no shift correction
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add x22, %x[qp], %[minval]\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x22]\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
+ "st1b { z31.b }, p1, [x26]\n"
+ "add z17.s, z17.s, z4.s\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x20]\n"
+ "42:" // Height 3: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 30b\n"
+ "b 58f\n"
+ "43:" // Height 4
+ "mov z11.s, #0x0\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x27, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x28, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x26, %x[output_ptr]\n"
+ "mov z14.s, #0x0\n"
+ "mov x19, #0x4\n"
+ "mov z15.b, #0x1\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "44:" // Height 4: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x9\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "45:" // Height 4: setup done
+ "mov x25, #0x0\n"
+ "46:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w24, [x20, x25, LSL #0x2]\n"
+ "tbz %x[flags], #3, 47f\n"
+ "ldr x20, [%x[input_ptr], x25, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x23, [x20, #0x0]\n"
+ "ldr x22, [x20, #0x8]\n"
+ "ldr x21, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x25, 48f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct input
+ "mov x23, %x[input_ptr]\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "48:" // Height 4: input setup done
+ "cmp x24, #0x10\n"
+ "ble 51f\n"
+ "49:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "add x23, x23, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x20]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #16\n"
+ ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-8, MUL VL]\n"
+ ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #-7, MUL VL]\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #-6, MUL VL]\n"
+ ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n"
+ "ld1b { z9.b }, p2/Z, [x28, #-5, MUL VL]\n"
+ ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
+ ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #-4, MUL VL]\n"
+ ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
+ ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #-3, MUL VL]\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #-2, MUL VL]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #-1, MUL VL]\n"
+ ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n"
+ ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n"
+ ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
+ ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n"
+ ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
+ ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n"
+ ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
+ ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
+ ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n"
+ "tbnz %x[flags], #31, 50f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "udot z13.s, z3.b, z15.b\n"
+ "50:" // Height 4: Multiply loop: unique 7: skip row sum
+ "sub x24, x24, #0x10\n"
+ "cmp x24, #0x10\n"
+ "bgt 49b\n"
+ "51:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z5.b }, p2/Z, [x28]\n"
+ "whilelt p0.b, XZR, x24\n"
+ "ld1b { z6.b }, p2/Z, [x28, #1, MUL VL]\n"
+ "subs x24, x24, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x23]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x21]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x20]\n"
+ "ld1b { z7.b }, p2/Z, [x28, #2, MUL VL]\n"
+ ".inst 0x45c59810 // ummla z16.s, z0.b, z5.b\n"
+ ".inst 0x45c69814 // ummla z20.s, z0.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #3, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #4, MUL VL]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "ld1b { z10.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c79811 // ummla z17.s, z0.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c6985c // ummla z28.s, z2.b, z6.b\n"
+ ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ ".inst 0x45c89815 // ummla z21.s, z0.b, z8.b\n"
+ ".inst 0x45c8985d // ummla z29.s, z2.b, z8.b\n"
+ ".inst 0x45c99812 // ummla z18.s, z0.b, z9.b\n"
+ ".inst 0x45c9985a // ummla z26.s, z2.b, z9.b\n"
+ ".inst 0x45ca9816 // ummla z22.s, z0.b, z10.b\n"
+ ".inst 0x45ca985e // ummla z30.s, z2.b, z10.b\n"
+ ".inst 0x45c49813 // ummla z19.s, z0.b, z4.b\n"
+ ".inst 0x45c4985b // ummla z27.s, z2.b, z4.b\n"
+ ".inst 0x45c59817 // ummla z23.s, z0.b, z5.b\n"
+ ".inst 0x45c5985f // ummla z31.s, z2.b, z5.b\n"
+ "ble 52f\n"
+ "ld1b { z6.b }, p2/Z, [x28]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ "ld1b { z7.b }, p2/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x45c69878 // ummla z24.s, z3.b, z6.b\n"
+ "ld1b { z8.b }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1b { z9.b }, p2/Z, [x28, #3, MUL VL]\n"
+ ".inst 0x45c79834 // ummla z20.s, z1.b, z7.b\n"
+ "ld1b { z10.b }, p2/Z, [x28, #4, MUL VL]\n"
+ ".inst 0x45c7987c // ummla z28.s, z3.b, z7.b\n"
+ "ld1b { z4.b }, p2/Z, [x28, #5, MUL VL]\n"
+ ".inst 0x45c89831 // ummla z17.s, z1.b, z8.b\n"
+ "ld1b { z5.b }, p2/Z, [x28, #6, MUL VL]\n"
+ ".inst 0x45c89879 // ummla z25.s, z3.b, z8.b\n"
+ "ld1b { z6.b }, p2/Z, [x28, #7, MUL VL]\n"
+ "addvl x28, x28, #8\n"
+ ".inst 0x45c99835 // ummla z21.s, z1.b, z9.b\n"
+ ".inst 0x45c9987d // ummla z29.s, z3.b, z9.b\n"
+ ".inst 0x45ca9832 // ummla z18.s, z1.b, z10.b\n"
+ ".inst 0x45ca987a // ummla z26.s, z3.b, z10.b\n"
+ ".inst 0x45c49836 // ummla z22.s, z1.b, z4.b\n"
+ ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
+ ".inst 0x45c69837 // ummla z23.s, z1.b, z6.b\n"
+ ".inst 0x45c6987f // ummla z31.s, z3.b, z6.b\n"
+ "52:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 53f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z11.s, z1.b, z15.b\n"
+ "udot z13.s, z3.b, z15.b\n"
+ "53:" // Height 4: Multiply loop: unique 8: skip row sum
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x25, x25, #0x1\n"
+ "cmp x25, x19\n"
+ "bne 46b\n"
+ "uzp1 z7.d, z16.d, z20.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "add x21, x26, x19\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "add x20, x21, x19\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "add x19, x20, x19\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "mov z31.d, z7.d\n"
+ "tbnz %x[flags], #31, 54f\n"
+ ".inst 0x4491a96b // addp z11.s, p2/m, z11.s, z11.s\n"
+ "add x22, %x[qp], %[b_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x4491a9ad // addp z13.s, p2/m, z13.s, z13.s\n"
+ "mov z12.s, z11.s[3]\n"
+ "mov z11.s, z11.s[0]\n"
+ "neg z4.s, p2/M, z4.s\n"
+ "mov z14.s, z13.s[3]\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z4.s\n"
+ "mul z12.s, p2/M, z12.s, z4.s\n"
+ "mul z13.s, p2/M, z13.s, z4.s\n"
+ "mul z14.s, p2/M, z14.s, z4.s\n"
+ "54:" // Height 4: skip row sum fixup
+ "add z31.s, z31.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x27]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z20.s, z20.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x23, %x[qp], %[per_layer_right_shift]\n"
+ "add z21.s, z21.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "add x22, %x[qp], %[per_layer_mul]\n"
+ "add z22.s, z22.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "add z16.s, z16.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ "add z17.s, z17.s, z12.s\n"
+ "add z18.s, z18.s, z12.s\n"
+ "add z19.s, z19.s, z12.s\n"
+ "add z23.s, z23.s, z13.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z31.s, z31.s, z0.s\n"
+ "add z20.s, z20.s, z1.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "add z22.s, z22.s, z3.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z23.s, z23.s, z0.s\n"
+ "add z28.s, z28.s, z1.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z30.s, z30.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x23]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
+ ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "tbz %x[flags], #5, 55f\n"
+ "and z4.d, z31.d, z0.d\n"
+ "and z5.d, z20.d, z0.d\n"
+ "and z6.d, z21.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z31.s, z31.s, z4.s\n"
+ "sqadd z20.s, z20.s, z5.s\n"
+ "sqadd z21.s, z21.s, z6.s\n"
+ "and z7.d, z22.d, z0.d\n"
+ "and z8.d, z16.d, z0.d\n"
+ "and z9.d, z17.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z7.s\n"
+ "sqadd z16.s, z16.s, z8.s\n"
+ "sqadd z17.s, z17.s, z9.s\n"
+ "and z10.d, z18.d, z0.d\n"
+ "and z4.d, z19.d, z0.d\n"
+ "and z5.d, z23.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z10.s\n"
+ "sqadd z19.s, z19.s, z4.s\n"
+ "sqadd z23.s, z23.s, z5.s\n"
+ "and z6.d, z28.d, z0.d\n"
+ "and z7.d, z29.d, z0.d\n"
+ "and z8.d, z30.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z6.s\n"
+ "sqadd z29.s, z29.s, z7.s\n"
+ "sqadd z30.s, z30.s, z8.s\n"
+ "and z9.d, z24.d, z0.d\n"
+ "and z10.d, z25.d, z0.d\n"
+ "and z4.d, z26.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z9.s\n"
+ "sqadd z25.s, z25.s, z10.s\n"
+ "sqadd z26.s, z26.s, z4.s\n"
+ "and z5.d, z27.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z5.s\n"
+ "55:" // Height 4: no shift correction
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "add x22, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x22]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add x22, %x[qp], %[minval]\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x22]\n"
+ "add x22, %x[qp], %[maxval]\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x22]\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z31.h, z31.h, z20.h\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "uzp1 z20.h, z21.h, z22.h\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "uzp1 z31.b, z31.b, z20.b\n"
+ "st1b { z31.b }, p1, [x26]\n"
+ "add z17.s, z17.s, z4.s\n"
+ "addvl x26, x26, #1\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x21]\n"
+ "add z29.s, z29.s, z4.s\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ "uzp1 z23.h, z23.h, z28.h\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "uzp1 z28.h, z29.h, z30.h\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "add z26.s, z26.s, z4.s\n"
+ "uzp1 z23.b, z23.b, z28.b\n"
+ "st1b { z23.b }, p1, [x20]\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "add z27.s, z27.s, z4.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x19]\n"
+ "56:" // Height 4: Writeback done
+ "decw x9, ALL, MUL #4\n"
+ "cmp x9, XZR\n"
+ "bgt 44b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 58f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 57f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "57:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "58:" // Exit
+
+ : [M] "+&r" (M), [flags] "+&r" (flags), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
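
The quantized epilogue that closes above follows arm_gemm's usual requantization sequence: add the row and column bias vectors, SQRDMULH by the per-layer multiplier, an optional sign-correction pass (the AND/ASR/SQADD block guarded by flags bit 5), a rounding right shift via SRSHL, then the c_offset, min/max clamping and UZP1 narrowing to bytes. Below is a minimal scalar sketch of that flow, assuming gemmlowp-style rounding semantics; the helper names are illustrative, and the shift-correction detail is approximated rather than reproduced bit-for-bit.

#include <algorithm>
#include <cstdint>

// Saturating rounding doubling multiply-high, as in SQRDMULH (gemmlowp-style).
static int32_t sqrdmulh_scalar(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
    int64_t ab    = (int64_t)a * (int64_t)b;
    int64_t nudge = ab >= 0 ? (1LL << 30) : (1 - (1LL << 30));
    return (int32_t)((ab + nudge) / (1LL << 31));
}

static int8_t requantize_scalar(int32_t acc, int32_t row_bias, int32_t col_bias,
                                int32_t mul, int shift /* right shift, >= 0 */,
                                int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = sqrdmulh_scalar(acc + row_bias + col_bias, mul);
    // Rounding right shift: SRSHL with a negated shift operand in the asm.
    if (shift > 0) v = (int32_t)(((int64_t)v + (1LL << (shift - 1))) >> shift);
    v += c_offset;                             // [c_offset] broadcast via ld1rw
    v = std::max(minval, std::min(maxval, v)); // smax/smin against [minval]/[maxval]
    return (int8_t)v;                          // uzp1 .h then .b narrowing + st1b
}

At height 4 the kernel keeps all sixteen accumulators (z16-z31) live, which is why the correction block above has to recycle z4-z10 as scratch between groups of three registers.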
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
index a2883bfa30..d870711c6e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,9 +22,10 @@
* IN THE SOFTWARE.
*/
#pragma once
-#ifdef ARM_COMPUTE_ENABLE_SVE
+#ifdef ARM_COMPUTE_ENABLE_SVE
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
#define ARGLIST \
unsigned int, const unsigned int *, \
@@ -42,7 +43,8 @@ void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
class cls_sve_hybrid_u8u32_dot_6x4VL
{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)( ARGLIST );
@@ -68,7 +70,36 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 4, 4> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.56 };
+ case CPUModel::A510:
+ return { 20.98 };
+ case CPUModel::V1:
+ return { 62.19 };
+ }
+ }
+
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.59, 15.67, 0.61 };
+ case CPUModel::A510:
+ return { 22.75, 3.90, 0.47 };
+ case CPUModel::V1:
+ return { 62.97, 19.27, 0.92 };
+ }
+ }
+
+ return { 1.0 };
+ }
// Default to the generic kernel
kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
@@ -80,4 +111,5 @@ public:
} // namespace arm_gemm
#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
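
The get_performance_parameters() hook added above returns per-CPU throughput estimates: a single kernel MACs/cycle figure for one accumulation type, and a {kernel, prepare, merge} triple for the other, with a neutral { 1.0 } fallback. Selection heuristics can turn such figures into a cycle estimate and pick the cheapest candidate kernel. The sketch below shows the shape of such a three-term cost model; estimate_cycles() and its exact weighting are assumptions for illustration, not the library's actual selection code.

#include <cstddef>
#include <cstdint>

struct PerfParams {
    double kernel_macs_cycle;          // inner-kernel MACs per cycle
    double prepare_bytes_cycle = 0.0;  // B-panel transform throughput
    double merge_bytes_cycle   = 0.0;  // output merge throughput
};

// Rough cycle estimate for an MxNxK problem under a three-term model
// (hypothetical formula: MAC work plus prepare and merge traffic).
static double estimate_cycles(size_t M, size_t N, size_t K, const PerfParams &p)
{
    double mac_cycles     = (double)M * (double)N * (double)K / p.kernel_macs_cycle;
    double prepare_cycles = p.prepare_bytes_cycle > 0 ? (double)N * (double)K / p.prepare_bytes_cycle : 0.0;
    double merge_cycles   = p.merge_bytes_cycle   > 0 ? (double)M * (double)N * sizeof(uint32_t) / p.merge_bytes_cycle : 0.0;
    return mac_cycles + prepare_cycles + merge_cycles;
}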
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
new file mode 100644
index 0000000000..11f9165a3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/a64fx.cpp
@@ -0,0 +1,1033 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_6x4VL_a64fx (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 51f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 41f\n"
+ "beq 31f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 21f\n"
+ "beq 11f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "add x25, x25, #0x4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "addvl x9, x9, #4\n"
+ "bne 5b\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "10:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 62f\n"
+ "11:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "12:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 13f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "b 14f\n"
+ "13:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "14:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "15:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "17:" // Height 2: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 19f\n"
+ "18:" // Height 2: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x25, x25, #0x4\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "subs x26, x26, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 18b\n"
+ "19:" // Height 2: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "addvl x9, x9, #4\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "bne 15b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "20:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 12b\n"
+ "b 62f\n"
+ "21:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "22:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "b 24f\n"
+ "23:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "24:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "25:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 26f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 27f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 27f\n"
+ "26:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "27:" // Height 3: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 29f\n"
+ "28:" // Height 3: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x4\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x4\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "add x23, x23, #0x4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 28b\n"
+ "29:" // Height 3: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x27, x27, #0x1\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "cmp x27, x19\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "bne 25b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "30:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 22b\n"
+ "b 62f\n"
+ "31:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "32:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 33f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "34:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "35:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 37f\n"
+ "36:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "37:" // Height 4: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 39f\n"
+ "38:" // Height 4: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x4\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x24, x24, #0x4\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "add x23, x23, #0x4\n"
+ "add x22, x22, #0x4\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z22.s, z6.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "udot z23.s, z7.b, z3.b\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 38b\n"
+ "39:" // Height 4: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x27, x27, #0x1\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "cmp x27, x19\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z22.s, z6.b, z3.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "udot z23.s, z7.b, z3.b\n"
+ "bne 35b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "40:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 32b\n"
+ "b 62f\n"
+ "41:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "42:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 43f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "b 44f\n"
+ "43:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "44:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "45:" // Height 5: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 46f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 47f\n"
+ "46:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "47:" // Height 5: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 49f\n"
+ "48:" // Height 5: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x4\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "udot z24.s, z6.b, z4.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x22, x22, #0x4\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "add x21, x21, #0x4\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "udot z25.s, z7.b, z4.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z22.s, z6.b, z3.b\n"
+ "udot z26.s, z6.b, z4.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "udot z23.s, z7.b, z3.b\n"
+ "udot z27.s, z7.b, z4.b\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 48b\n"
+ "49:" // Height 5: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x27, x27, #0x1\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "cmp x27, x19\n"
+ "udot z24.s, z6.b, z4.b\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "udot z25.s, z7.b, z4.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z22.s, z6.b, z3.b\n"
+ "udot z26.s, z6.b, z4.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "udot z23.s, z7.b, z3.b\n"
+ "udot z27.s, z7.b, z4.b\n"
+ "bne 45b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x20]\n"
+ "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
+ "50:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 42b\n"
+ "b 62f\n"
+ "51:" // Height 6
+ "ldr x20, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov x19, #0x18\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "madd %x[output_ptr], x20, x19, %x[output_ptr]\n"
+ "52:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p0.s, x19, x10\n"
+ "tbz %x[flags], #0, 53f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z8.s }, p3/Z, [x28]\n"
+ "ld1w { z9.s }, p2/Z, [x28, #1, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z11.s }, p0/Z, [x28, #3, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "ld1w { z12.s }, p3/Z, [x23]\n"
+ "ld1w { z13.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z14.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x22]\n"
+ "ld1w { z17.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z19.s }, p0/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z20.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z23.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "ld1w { z24.s }, p3/Z, [x20]\n"
+ "ld1w { z25.s }, p2/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x20, #2, MUL VL]\n"
+ "ld1w { z27.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "ld1w { z28.s }, p3/Z, [x19]\n"
+ "ld1w { z29.s }, p2/Z, [x19, #1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x19, #2, MUL VL]\n"
+ "ld1w { z31.s }, p0/Z, [x19, #3, MUL VL]\n"
+ "b 54f\n"
+ "53:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "54:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "55:" // Height 6: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w26, [x19, x27, LSL #0x2]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 56f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 57f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 57f\n"
+ "56:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "57:" // Height 6: input setup done
+ "subs x26, x26, #0x4\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "ble 59f\n"
+ "58:" // Height 6: Multiply loop: Main loop
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x25, x25, #0x4\n"
+ "subs x26, x26, #0x4\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "add x24, x24, #0x4\n"
+ "add x23, x23, #0x4\n"
+ "udot z24.s, z6.b, z4.b\n"
+ "udot z28.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "add x22, x22, #0x4\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "add x21, x21, #0x4\n"
+ "add x20, x20, #0x4\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "udot z25.s, z7.b, z4.b\n"
+ "udot z29.s, z7.b, z5.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z22.s, z6.b, z3.b\n"
+ "udot z26.s, z6.b, z4.b\n"
+ "udot z30.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x9]\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "ld1rw { z0.s }, p4/Z, [x25]\n"
+ "ld1rw { z1.s }, p4/Z, [x24]\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "udot z23.s, z7.b, z3.b\n"
+ "ld1rw { z2.s }, p4/Z, [x23]\n"
+ "ld1rw { z3.s }, p4/Z, [x22]\n"
+ "udot z27.s, z7.b, z4.b\n"
+ "udot z31.s, z7.b, z5.b\n"
+ "ld1rw { z4.s }, p4/Z, [x21]\n"
+ "ld1rw { z5.s }, p4/Z, [x20]\n"
+ "ld1b { z7.b }, p4/Z, [x9, #1, MUL VL]\n"
+ "bgt 58b\n"
+ "59:" // Height 6: Multiply loop: Main loop skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "udot z8.s, z6.b, z0.b\n"
+ "udot z12.s, z6.b, z1.b\n"
+ "add x27, x27, #0x1\n"
+ "udot z16.s, z6.b, z2.b\n"
+ "udot z20.s, z6.b, z3.b\n"
+ "cmp x27, x19\n"
+ "udot z24.s, z6.b, z4.b\n"
+ "udot z28.s, z6.b, z5.b\n"
+ "ld1b { z6.b }, p4/Z, [x9, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b\n"
+ "udot z13.s, z7.b, z1.b\n"
+ "udot z17.s, z7.b, z2.b\n"
+ "udot z21.s, z7.b, z3.b\n"
+ "udot z25.s, z7.b, z4.b\n"
+ "udot z29.s, z7.b, z5.b\n"
+ "ld1b { z7.b }, p4/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "udot z10.s, z6.b, z0.b\n"
+ "udot z14.s, z6.b, z1.b\n"
+ "udot z18.s, z6.b, z2.b\n"
+ "udot z22.s, z6.b, z3.b\n"
+ "udot z26.s, z6.b, z4.b\n"
+ "udot z30.s, z6.b, z5.b\n"
+ "udot z11.s, z7.b, z0.b\n"
+ "udot z15.s, z7.b, z1.b\n"
+ "udot z19.s, z7.b, z2.b\n"
+ "udot z23.s, z7.b, z3.b\n"
+ "udot z27.s, z7.b, z4.b\n"
+ "udot z31.s, z7.b, z5.b\n"
+ "bne 55b\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "add x22, x23, x19, LSL #2\n"
+ "st1w { z8.s }, p3, [x28]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "add x20, x21, x19, LSL #2\n"
+ "st1w { z9.s }, p2, [x28, #1, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "st1w { z10.s }, p1, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z12.s }, p3, [x23]\n"
+ "st1w { z13.s }, p2, [x23, #1, MUL VL]\n"
+ "st1w { z14.s }, p1, [x23, #2, MUL VL]\n"
+ "st1w { z15.s }, p0, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p3, [x22]\n"
+ "st1w { z17.s }, p2, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p0, [x22, #3, MUL VL]\n"
+ "st1w { z20.s }, p3, [x21]\n"
+ "st1w { z21.s }, p2, [x21, #1, MUL VL]\n"
+ "st1w { z22.s }, p1, [x21, #2, MUL VL]\n"
+ "st1w { z23.s }, p0, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p3, [x20]\n"
+ "st1w { z25.s }, p2, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [x20, #3, MUL VL]\n"
+ "st1w { z28.s }, p3, [x19]\n"
+ "st1w { z29.s }, p2, [x19, #1, MUL VL]\n"
+ "st1w { z30.s }, p1, [x19, #2, MUL VL]\n"
+ "st1w { z31.s }, p0, [x19, #3, MUL VL]\n"
+ "60:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 52b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "61:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "62:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
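
As with the other hybrid kernels in this patch, the A64FX wrapper above packs dispatch state into a local KernelArgs block plus a flags word that the assembly probes with TBZ. The bit meanings can be read straight off the wrapper code and the tbz tests; the enum below merely names them (the identifiers are illustrative, not from the source).

enum KernelFlags : unsigned long {
    FLAG_ACCUMULATE      = 0x1, // "tbz %x[flags], #0" skips loading existing output when clear
    FLAG_INDIRECT_OUTPUT = 0x4, // set when output_arg.is_indirect
    FLAG_INDIRECT_INPUT  = 0x8, // "tbz %x[flags], #3" selects direct-input setup when clear
};

Also visible above: the A64FX body broadcasts one A word at a time with LD1RW and uses plain UDOT, whereas generic.cpp loads a quadword with LD1RQB and uses the indexed udot z, z, z.b[i] form.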
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
index 413bc65288..fc8bdb50a9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -137,13 +137,12 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "cmp x26, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z10.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
"udot z11.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #5, MUL VL]\n"
"udot z8.s, z6.b, z0.b[1]\n"
@@ -178,7 +177,6 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
@@ -217,9 +215,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z10.s, z6.b, z0.b[3]\n"
"udot z11.s, z7.b, z0.b[3]\n"
"10:" // Height 1: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 5b\n"
"st1w { z8.s }, p4, [x28]\n"
@@ -296,16 +293,14 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x25, x25, #0x10\n"
"add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #4, MUL VL]\n"
@@ -356,9 +351,7 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
- "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
"udot z13.s, z7.b, z1.b[0]\n"
@@ -413,10 +406,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z11.s, z7.b, z0.b[3]\n"
"udot z15.s, z7.b, z1.b[3]\n"
"21:" // Height 2: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 16b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -511,21 +502,18 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
"add x23, x23, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "cmp x26, #0x10\n"
- "udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
"udot z17.s, z7.b, z2.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
@@ -590,12 +578,9 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
- "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"udot z16.s, z6.b, z2.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
@@ -665,11 +650,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z15.s, z7.b, z1.b[3]\n"
"udot z19.s, z7.b, z2.b[3]\n"
"32:" // Height 3: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 27b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -781,26 +763,22 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"udot z10.s, z6.b, z0.b[0]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
@@ -879,19 +857,15 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "add x22, x22, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"addvl x9, x9, #4\n"
@@ -972,12 +946,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z19.s, z7.b, z2.b[3]\n"
"udot z23.s, z7.b, z3.b[3]\n"
"43:" // Height 4: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 38b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1106,32 +1076,27 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"add x21, x21, #0x10\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
"udot z24.s, z6.b, z4.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"udot z25.s, z7.b, z4.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"udot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
"udot z22.s, z6.b, z3.b[0]\n"
@@ -1223,22 +1188,17 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
- "add x21, x21, #0x10\n"
- "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
"udot z24.s, z6.b, z4.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"udot z25.s, z7.b, z4.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
@@ -1334,13 +1294,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z23.s, z7.b, z3.b[3]\n"
"udot z27.s, z7.b, z4.b[3]\n"
"54:" // Height 5: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 49b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
@@ -1489,37 +1444,31 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
+ "cmp x26, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
+ "add x25, x25, #0x10\n"
"udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
+ "add x24, x24, #0x10\n"
"udot z16.s, z6.b, z2.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
+ "add x23, x23, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "add x22, x22, #0x10\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "prfm pldl1keep, [x25, #0x80]\n"
+ "add x21, x21, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"add x20, x20, #0x10\n"
"udot z24.s, z6.b, z4.b[0]\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "cmp x26, #0x10\n"
"udot z28.s, z6.b, z5.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
- "udot z17.s, z7.b, z2.b[0]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
"udot z21.s, z7.b, z3.b[0]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
"udot z25.s, z7.b, z4.b[0]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
"udot z29.s, z7.b, z5.b[0]\n"
"ld1b { z7.b }, p5/Z, [x9, #3, MUL VL]\n"
"udot z10.s, z6.b, z0.b[0]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"udot z14.s, z6.b, z1.b[0]\n"
"udot z18.s, z6.b, z2.b[0]\n"
"udot z22.s, z6.b, z3.b[0]\n"
@@ -1625,25 +1574,19 @@ void sve_hybrid_u8u32_dot_6x4VL (
"ld1rqb { z0.b }, p0/Z, [x25]\n"
"udot z8.s, z6.b, z0.b[0]\n"
"ld1rqb { z1.b }, p0/Z, [x24]\n"
- "add x25, x25, #0x10\n"
"udot z9.s, z7.b, z0.b[0]\n"
"ld1rqb { z2.b }, p0/Z, [x23]\n"
- "add x24, x24, #0x10\n"
- "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z3.b }, p0/Z, [x22]\n"
- "add x23, x23, #0x10\n"
- "udot z16.s, z6.b, z2.b[0]\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
"ld1rqb { z4.b }, p0/Z, [x21]\n"
- "add x22, x22, #0x10\n"
"udot z13.s, z7.b, z1.b[0]\n"
"ld1rqb { z5.b }, p0/Z, [x20]\n"
- "add x21, x21, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
"udot z20.s, z6.b, z3.b[0]\n"
- "add x20, x20, #0x10\n"
- "udot z17.s, z7.b, z2.b[0]\n"
"udot z24.s, z6.b, z4.b[0]\n"
"udot z28.s, z6.b, z5.b[0]\n"
"ld1b { z6.b }, p5/Z, [x9, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
"udot z21.s, z7.b, z3.b[0]\n"
"udot z25.s, z7.b, z4.b[0]\n"
"udot z29.s, z7.b, z5.b[0]\n"
@@ -1754,14 +1697,8 @@ void sve_hybrid_u8u32_dot_6x4VL (
"udot z27.s, z7.b, z4.b[3]\n"
"udot z31.s, z7.b, z5.b[3]\n"
"65:" // Height 6: Multiply loop: multiply skip
- "prfm pldl1keep, [x25, #0x80]\n"
- "add x27, x27, #0x1\n"
- "prfm pldl1keep, [x24, #0x80]\n"
- "prfm pldl1keep, [x23, #0x80]\n"
- "prfm pldl1keep, [x22, #0x80]\n"
- "prfm pldl1keep, [x21, #0x80]\n"
- "prfm pldl1keep, [x20, #0x80]\n"
"ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
"cmp x27, x19\n"
"bne 60b\n"
"ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
new file mode 100644
index 0000000000..7f8eadc528
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL.hpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+
+#ifdef ARM_COMPUTE_ENABLE_SVE
+#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint32_t>, \
+ const uint32_t *, Activation, bool
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void sve_hybrid_u8u32_mmla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_u8u32_mmla_6x4VL
+{
+public:
+ typedef uint8_t lhs_operand_type;
+ typedef uint8_t rhs_operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 8;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<rhs_operand_type, result_type, 6, 8, 8> transforms = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.45 };
+ case CPUModel::A510:
+ return { 24.22 };
+ case CPUModel::V1:
+ return { 105.16 };
+ }
+ }
+
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 54.90, 15.69, 0.62 };
+ case CPUModel::A510:
+ return { 26.80, 3.89, 0.47 };
+ case CPUModel::V1:
+ return { 108.33, 18.66, 0.92 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_u8u32_mmla_6x4VL;
+ cls_sve_hybrid_u8u32_mmla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+
+#endif // ARM_COMPUTE_ENABLE_SVE
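
The MMLA header above keeps the 6x4VL output tile of the dot-product kernel but raises k_unroll() to 8, matching the 8-byte K blocks each UMMLA instruction consumes, and widens the StdTransformsSVE parameters from <6, 4, 4> to <6, 8, 8>. A small sketch of what out_width() evaluates to across vector lengths (sve_vector_bytes stands in for the runtime get_vector_length query):

#include <cstdint>

// "4VL" of uint32_t results: four SVE vectors' worth of 32-bit words.
static unsigned int tile_width_elems(unsigned int sve_vector_bytes)
{
    unsigned int words_per_vector = sve_vector_bytes / sizeof(uint32_t);
    return words_per_vector * 4; // 16 at 128-bit VL, 32 at 256-bit, 64 at 512-bit
}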
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..e8bad69ccd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_mmla_6x4VL/generic.cpp
@@ -0,0 +1,1675 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_mmla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
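+ // Flag bit layout, as set above and tested with "tbz %x[flags], #n" in
+ // the assembly: bit 0 (0x1) = accumulate into the existing output,
+ // bit 2 (0x4) = indirect output, bit 3 (0x8) = indirect input.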
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 56f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 45f\n"
+ "beq 34f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 23f\n"
+ "beq 12f\n"
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "2:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 3f\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 4f\n"
+ "3:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x27, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "cbnz x27, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x26, #0x10\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "cmp x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "subs x26, x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ "ble 10f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 5b\n"
+ "uzp1 z8.d, z8.d, z12.d\n"
+ "st1w { z8.s }, p4, [x28]\n"
+ "uzp1 z9.d, z9.d, z13.d\n"
+ "uzp1 z10.d, z10.d, z14.d\n"
+ "st1w { z9.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z11.d, z11.d, z15.d\n"
+ "st1w { z10.s }, p2, [x28, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "11:" // Height 1: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 2b\n"
+ "b 68f\n"
+ "12:" // Height 2
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "13:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 14f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "b 15f\n"
+ "14:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "15:" // Height 2: setup done
+ "mov x27, #0x0\n"
+ "16:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x27, 18f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "18:" // Height 2: input setup done
+ "cmp x26, #0x10\n"
+ "ble 20f\n"
+ "19:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "cmp x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "add x25, x25, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ "bgt 19b\n"
+ "20:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ "ble 21f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ "21:" // Height 2: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 16b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "22:" // Height 2: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 13b\n"
+ "b 68f\n"
+ "23:" // Height 3
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "24:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 25f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 26f\n"
+ "25:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "26:" // Height 3: setup done
+ "mov x27, #0x0\n"
+ "27:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 28f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "cbnz x27, 29f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 29f\n"
+ "28:" // Height 3: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "29:" // Height 3: input setup done
+ "cmp x26, #0x10\n"
+ "ble 31f\n"
+ "30:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "cmp x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ "bgt 30b\n"
+ "31:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ "ble 32f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ "32:" // Height 3: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 27b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "uzp1 z16.d, z16.d, z20.d\n"
+ "addvl x28, x28, #4\n"
+ "uzp1 z17.d, z17.d, z21.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "uzp1 z18.d, z18.d, z22.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp1 z19.d, z19.d, z23.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x22]\n"
+ "st1w { z17.s }, p3, [x22, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x22, #3, MUL VL]\n"
+ "33:" // Height 3: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 24b\n"
+ "b 68f\n"
+ "34:" // Height 4
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "35:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 36f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "b 37f\n"
+ "36:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "37:" // Height 4: setup done
+ "mov x27, #0x0\n"
+ "38:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 39f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "cbnz x27, 40f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 40f\n"
+ "39:" // Height 4: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "40:" // Height 4: input setup done
+ "cmp x26, #0x10\n"
+ "ble 42f\n"
+ "41:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "cmp x26, #0x10\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "add x25, x25, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "add x23, x23, #0x10\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ "add x22, x22, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ "bgt 41b\n"
+ "42:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ "ble 43f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ "43:" // Height 4: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 38b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "addvl x28, x28, #4\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "st1w { z15.s }, p4, [x22]\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "st1w { z22.s }, p1, [x22, #3, MUL VL]\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "44:" // Height 4: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 35b\n"
+ "b 68f\n"
+ "45:" // Height 5
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "46:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 47f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z6.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z6.d, z31.d\n"
+ "zip2 z31.d, z6.d, z31.d\n"
+ "b 48f\n"
+ "47:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "48:" // Height 5: setup done
+ "mov x27, #0x0\n"
+ "49:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "cbnz x27, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "51:" // Height 5: input setup done
+ "cmp x26, #0x10\n"
+ "ble 53f\n"
+ "52:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "sub x26, x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "cmp x26, #0x10\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "add x25, x25, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "add x24, x24, #0x10\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "add x23, x23, #0x10\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "add x22, x22, #0x10\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
+ ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
+ ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
+ ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
+ ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
+ ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
+ ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
+ ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
+ ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
+ ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
+ ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
+ ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n"
+ "bgt 52b\n"
+ "53:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "subs x26, x26, #0x8\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
+ ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
+ ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
+ ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
+ ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n"
+ "ble 54f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
+ ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
+ ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
+ ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
+ ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
+ ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
+ ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
+ ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n"
+ "54:" // Height 5: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 49b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "addvl x28, x28, #4\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "st1w { z15.s }, p4, [x22]\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "uzp1 z24.d, z24.d, z28.d\n"
+ "st1w { z22.s }, p1, [x22, #3, MUL VL]\n"
+ "uzp1 z25.d, z25.d, z29.d\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "uzp1 z26.d, z26.d, z30.d\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "uzp1 z27.d, z27.d, z31.d\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x20]\n"
+ "st1w { z25.s }, p3, [x20, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #3, MUL VL]\n"
+ "55:" // Height 5: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 46b\n"
+ "b 68f\n"
+ "56:" // Height 6
+ "ldr x10, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x28, %x[output_ptr]\n"
+ "ldr x9, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x20, #0x18\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "madd %x[output_ptr], x19, x20, %x[output_ptr]\n"
+ "57:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x10\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x10\n"
+ "tbz %x[flags], #0, 58f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "ld1w { z9.s }, p4/Z, [x28]\n"
+ "add x23, x28, x19, LSL #2\n"
+ "ld1w { z10.s }, p3/Z, [x28, #1, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x28, #2, MUL VL]\n"
+ "add x22, x23, x19, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x28, #3, MUL VL]\n"
+ "add x21, x22, x19, LSL #2\n"
+ "ld1w { z12.s }, p4/Z, [x23]\n"
+ "zip1 z8.d, z9.d, z12.d\n"
+ "ld1w { z13.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "add x20, x21, x19, LSL #2\n"
+ "zip2 z12.d, z9.d, z12.d\n"
+ "ld1w { z14.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "zip1 z9.d, z10.d, z13.d\n"
+ "ld1w { z15.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "zip2 z13.d, z10.d, z13.d\n"
+ "ld1w { z17.s }, p4/Z, [x22]\n"
+ "zip1 z10.d, z11.d, z14.d\n"
+ "ld1w { z18.s }, p3/Z, [x22, #1, MUL VL]\n"
+ "zip2 z14.d, z11.d, z14.d\n"
+ "ld1w { z19.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "zip1 z11.d, z16.d, z15.d\n"
+ "ld1w { z24.s }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip2 z15.d, z16.d, z15.d\n"
+ "ld1w { z20.s }, p4/Z, [x21]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "zip1 z16.d, z17.d, z20.d\n"
+ "ld1w { z22.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "zip2 z20.d, z17.d, z20.d\n"
+ "ld1w { z23.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip1 z17.d, z18.d, z21.d\n"
+ "ld1w { z25.s }, p4/Z, [x20]\n"
+ "zip2 z21.d, z18.d, z21.d\n"
+ "ld1w { z26.s }, p3/Z, [x20, #1, MUL VL]\n"
+ "zip1 z18.d, z19.d, z22.d\n"
+ "ld1w { z27.s }, p2/Z, [x20, #2, MUL VL]\n"
+ "zip2 z22.d, z19.d, z22.d\n"
+ "ld1w { z6.s }, p1/Z, [x20, #3, MUL VL]\n"
+ "zip1 z19.d, z24.d, z23.d\n"
+ "ld1w { z28.s }, p4/Z, [x19]\n"
+ "zip2 z23.d, z24.d, z23.d\n"
+ "ld1w { z29.s }, p3/Z, [x19, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x19, #2, MUL VL]\n"
+ "zip1 z24.d, z25.d, z28.d\n"
+ "ld1w { z31.s }, p1/Z, [x19, #3, MUL VL]\n"
+ "zip2 z28.d, z25.d, z28.d\n"
+ "zip1 z25.d, z26.d, z29.d\n"
+ "zip2 z29.d, z26.d, z29.d\n"
+ "zip1 z26.d, z27.d, z30.d\n"
+ "zip2 z30.d, z27.d, z30.d\n"
+ "zip1 z27.d, z6.d, z31.d\n"
+ "zip2 z31.d, z6.d, z31.d\n"
+ "b 59f\n"
+ "58:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "59:" // Height 6: setup done
+ "mov x27, #0x0\n"
+ "60:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w26, [x20, x27, LSL #0x2]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "ldr x20, [%x[input_ptr], x27, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x25, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x23, [x20, #0x10]\n"
+ "ldr x22, [x20, #0x18]\n"
+ "ldr x21, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x27, 62f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x21, x21, x19\n"
+ "add x20, x20, x19\n"
+ "b 62f\n"
+ "61:" // Height 6: setup direct input
+ "mov x25, %x[input_ptr]\n"
+ "add x24, x25, x19\n"
+ "add x23, x24, x19\n"
+ "add x22, x23, x19\n"
+ "add x21, x22, x19\n"
+ "add x20, x21, x19\n"
+ "62:" // Height 6: input setup done
+ "cmp x26, #0x10\n"
+ "ble 64f\n"
+ "63:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "sub x26, x26, #0x10\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "cmp x26, #0x10\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x25, x25, #0x10\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ "add x24, x24, #0x10\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "ld1rqb { z6.b }, p0/Z, [x20]\n"
+ "add x23, x23, #0x10\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, #0x10\n"
+ ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
+ "add x20, x20, #0x10\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
+ ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
+ ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
+ ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-8, MUL VL]\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-7, MUL VL]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
+ ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-6, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
+ ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-5, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
+ ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
+ ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-3, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
+ ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #-2, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
+ ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #-1, MUL VL]\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
+ ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n"
+ "bgt 63b\n"
+ "64:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ "whilelt p0.b, XZR, x26\n"
+ "subs x26, x26, #0x8\n"
+ "ld1rqb { z1.b }, p0/Z, [x25]\n"
+ "ld1rqb { z2.b }, p0/Z, [x24]\n"
+ "trn1 z0.d, z1.d, z2.d\n"
+ "ld1rqb { z3.b }, p0/Z, [x23]\n"
+ "trn2 z1.d, z1.d, z2.d\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "ld1rqb { z5.b }, p0/Z, [x21]\n"
+ ".inst 0x45c79808 // ummla z8.s, z0.b, z7.b\n"
+ "ld1rqb { z6.b }, p0/Z, [x20]\n"
+ "trn1 z2.d, z3.d, z4.d\n"
+ "trn2 z3.d, z3.d, z4.d\n"
+ "trn1 z4.d, z5.d, z6.d\n"
+ "trn2 z5.d, z5.d, z6.d\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45c79850 // ummla z16.s, z2.b, z7.b\n"
+ ".inst 0x45c79898 // ummla z24.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ ".inst 0x45c6989c // ummla z28.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
+ ".inst 0x45c79851 // ummla z17.s, z2.b, z7.b\n"
+ ".inst 0x45c79899 // ummla z25.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6980d // ummla z13.s, z0.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c6989d // ummla z29.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7980a // ummla z10.s, z0.b, z7.b\n"
+ ".inst 0x45c79852 // ummla z18.s, z2.b, z7.b\n"
+ ".inst 0x45c7989a // ummla z26.s, z4.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6980e // ummla z14.s, z0.b, z6.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ ".inst 0x45c6989e // ummla z30.s, z4.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79853 // ummla z19.s, z2.b, z7.b\n"
+ ".inst 0x45c7989b // ummla z27.s, z4.b, z7.b\n"
+ ".inst 0x45c6980f // ummla z15.s, z0.b, z6.b\n"
+ ".inst 0x45c69857 // ummla z23.s, z2.b, z6.b\n"
+ ".inst 0x45c6989f // ummla z31.s, z4.b, z6.b\n"
+ "ble 65f\n"
+ "ld1b { z7.b }, p5/Z, [x9]\n"
+ ".inst 0x45c79828 // ummla z8.s, z1.b, z7.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x45c79870 // ummla z16.s, z3.b, z7.b\n"
+ ".inst 0x45c798b8 // ummla z24.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #2, MUL VL]\n"
+ ".inst 0x45c6982c // ummla z12.s, z1.b, z6.b\n"
+ ".inst 0x45c69874 // ummla z20.s, z3.b, z6.b\n"
+ ".inst 0x45c698bc // ummla z28.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #3, MUL VL]\n"
+ ".inst 0x45c79829 // ummla z9.s, z1.b, z7.b\n"
+ ".inst 0x45c79871 // ummla z17.s, z3.b, z7.b\n"
+ ".inst 0x45c798b9 // ummla z25.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #4, MUL VL]\n"
+ ".inst 0x45c6982d // ummla z13.s, z1.b, z6.b\n"
+ ".inst 0x45c69875 // ummla z21.s, z3.b, z6.b\n"
+ ".inst 0x45c698bd // ummla z29.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #5, MUL VL]\n"
+ ".inst 0x45c7982a // ummla z10.s, z1.b, z7.b\n"
+ ".inst 0x45c79872 // ummla z18.s, z3.b, z7.b\n"
+ ".inst 0x45c798ba // ummla z26.s, z5.b, z7.b\n"
+ "ld1b { z7.b }, p5/Z, [x9, #6, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ ".inst 0x45c69876 // ummla z22.s, z3.b, z6.b\n"
+ ".inst 0x45c698be // ummla z30.s, z5.b, z6.b\n"
+ "ld1b { z6.b }, p5/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #8\n"
+ ".inst 0x45c7982b // ummla z11.s, z1.b, z7.b\n"
+ ".inst 0x45c79873 // ummla z19.s, z3.b, z7.b\n"
+ ".inst 0x45c798bb // ummla z27.s, z5.b, z7.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69877 // ummla z23.s, z3.b, z6.b\n"
+ ".inst 0x45c698bf // ummla z31.s, z5.b, z6.b\n"
+ "65:" // Height 6: Multiply loop: multiply skip
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x27, x27, #0x1\n"
+ "cmp x27, x19\n"
+ "bne 60b\n"
+ "uzp1 z7.d, z8.d, z12.d\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "uzp2 z8.d, z8.d, z12.d\n"
+ "st1w { z7.s }, p4, [x28]\n"
+ "uzp1 z12.d, z9.d, z13.d\n"
+ "add x23, x28, x19, LSL #2\n"
+ "uzp2 z9.d, z9.d, z13.d\n"
+ "st1w { z12.s }, p3, [x28, #1, MUL VL]\n"
+ "uzp1 z13.d, z10.d, z14.d\n"
+ "add x22, x23, x19, LSL #2\n"
+ "uzp2 z10.d, z10.d, z14.d\n"
+ "st1w { z13.s }, p2, [x28, #2, MUL VL]\n"
+ "uzp1 z14.d, z11.d, z15.d\n"
+ "add x21, x22, x19, LSL #2\n"
+ "uzp2 z11.d, z11.d, z15.d\n"
+ "st1w { z14.s }, p1, [x28, #3, MUL VL]\n"
+ "uzp1 z15.d, z16.d, z20.d\n"
+ "add x20, x21, x19, LSL #2\n"
+ "uzp2 z16.d, z16.d, z20.d\n"
+ "st1w { z8.s }, p4, [x23]\n"
+ "add x19, x20, x19, LSL #2\n"
+ "uzp1 z20.d, z17.d, z21.d\n"
+ "st1w { z9.s }, p3, [x23, #1, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "uzp2 z17.d, z17.d, z21.d\n"
+ "st1w { z10.s }, p2, [x23, #2, MUL VL]\n"
+ "uzp1 z21.d, z18.d, z22.d\n"
+ "st1w { z11.s }, p1, [x23, #3, MUL VL]\n"
+ "uzp2 z18.d, z18.d, z22.d\n"
+ "st1w { z15.s }, p4, [x22]\n"
+ "uzp1 z22.d, z19.d, z23.d\n"
+ "st1w { z20.s }, p3, [x22, #1, MUL VL]\n"
+ "uzp2 z19.d, z19.d, z23.d\n"
+ "st1w { z21.s }, p2, [x22, #2, MUL VL]\n"
+ "uzp1 z23.d, z24.d, z28.d\n"
+ "st1w { z22.s }, p1, [x22, #3, MUL VL]\n"
+ "uzp2 z24.d, z24.d, z28.d\n"
+ "st1w { z16.s }, p4, [x21]\n"
+ "uzp1 z28.d, z25.d, z29.d\n"
+ "st1w { z17.s }, p3, [x21, #1, MUL VL]\n"
+ "uzp2 z25.d, z25.d, z29.d\n"
+ "st1w { z18.s }, p2, [x21, #2, MUL VL]\n"
+ "uzp1 z29.d, z26.d, z30.d\n"
+ "st1w { z19.s }, p1, [x21, #3, MUL VL]\n"
+ "uzp2 z26.d, z26.d, z30.d\n"
+ "st1w { z23.s }, p4, [x20]\n"
+ "uzp1 z30.d, z27.d, z31.d\n"
+ "st1w { z28.s }, p3, [x20, #1, MUL VL]\n"
+ "uzp2 z27.d, z27.d, z31.d\n"
+ "st1w { z29.s }, p2, [x20, #2, MUL VL]\n"
+ "st1w { z30.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z24.s }, p4, [x19]\n"
+ "st1w { z25.s }, p3, [x19, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x19, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x19, #3, MUL VL]\n"
+ "66:" // Height 6: Writeback done
+ "decw x10, ALL, MUL #4\n"
+ "cmp x10, XZR\n"
+ "bgt 57b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 68f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "67:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "68:" // Exit
+
+ : [M] "+&r" (M), [input_ptr] "+&r" (input_ptr), [output_ptr] "+&r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
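The uzp1/uzp2 writeback sequence above reflects how the SVE MMLA instructions lay out their results: each `ummla` accumulates 2x2 tiles, so within every 128-bit segment the 32-bit results for two output rows sit side by side, and the accumulators have to be de-interleaved on 64-bit lanes before the row-major `st1w` stores. Below is a scalar model of that de-interleave, under our reading of the layout; each 64-bit lane stands for an adjacent pair of 32-bit results, and the names and encoding are ours, not the library's.

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Scalar model of SVE UZP1/UZP2 on 64-bit elements: UZP1 gathers the
// even-indexed lanes of the concatenation a:b, UZP2 the odd-indexed ones.
// N is the number of 64-bit lanes per vector (4 models a 256-bit VL).
template <std::size_t N>
std::array<std::uint64_t, N> uzp1(const std::array<std::uint64_t, N> &a,
                                  const std::array<std::uint64_t, N> &b)
{
    std::array<std::uint64_t, N> r{};
    for (std::size_t i = 0; i < N / 2; i++) {
        r[i]         = a[2 * i];
        r[N / 2 + i] = b[2 * i];
    }
    return r;
}

template <std::size_t N>
std::array<std::uint64_t, N> uzp2(const std::array<std::uint64_t, N> &a,
                                  const std::array<std::uint64_t, N> &b)
{
    std::array<std::uint64_t, N> r{};
    for (std::size_t i = 0; i < N / 2; i++) {
        r[i]         = a[2 * i + 1];
        r[N / 2 + i] = b[2 * i + 1];
    }
    return r;
}

int main()
{
    // Encode "row r, column c" as r*100 + c. z8/z12 model an accumulator
    // pair whose row-0 and row-1 column pairs are interleaved per lane.
    std::array<std::uint64_t, 4> z8  = {0, 100, 1, 101};
    std::array<std::uint64_t, 4> z12 = {2, 102, 3, 103};
    auto row0 = uzp1(z8, z12); // {0, 1, 2, 3}         -> contiguous row 0
    auto row1 = uzp2(z8, z12); // {100, 101, 102, 103} -> contiguous row 1
    for (auto v : row0) std::printf("%llu ", (unsigned long long) v);
    std::printf("| ");
    for (auto v : row1) std::printf("%llu ", (unsigned long long) v);
    std::printf("\n");
    return 0;
}

This matches the pattern in the asm, where for instance "uzp1 z7.d, z8.d, z12.d" extracts the row-0 halves for one store and the matching uzp2 recovers the row-1 halves for the next output row.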
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
index d717b745c9..f5fdf993aa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,63 +10,92 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../bfloat.hpp"
#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_dot_8x3VL( ARGLIST );
-class cls_sve_interleaved_bf16fp32_dot_8x3VL {
+class cls_sve_interleaved_bf16fp32_dot_8x3VL
+{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return get_vector_length<float>() * 3;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return get_vector_length<float>();
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 2;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL;
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.92, 3.74, 7.14 };
+ case CPUModel::A510:
+ return { 7.54, 3.77, 2.43 };
+ case CPUModel::V1:
+ return { 31.82, 5.11, 11.20 };
+ }
+ }
+
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL;
cls_sve_interleaved_bf16fp32_dot_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
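The blocking interface in this header is now VL-relative: out_width() is three SVE vectors of floats, and the new stripe_width() exposes the single-vector granularity the output is striped in, so the same binary adapts to any hardware vector length. A quick sketch of what those accessors evaluate to; the svcntw() intrinsic counts 32-bit lanes per vector, and the sketch needs SVE enabled at compile time (e.g. -march=armv8.2-a+sve) to use it.

#include <cstdio>
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif

int main()
{
#if defined(__ARM_FEATURE_SVE)
    // Lanes of float per SVE vector: 4 on a 128-bit VL, 8 on 256-bit, ...
    unsigned int vl = (unsigned int) svcntw();
#else
    unsigned int vl = 4; // fallback so the sketch runs anywhere (128-bit VL)
#endif
    unsigned int out_height   = 8;      // fixed, as in the header above
    unsigned int stripe_width = vl;     // stripe_width(): one vector
    unsigned int out_width    = vl * 3; // out_width(): three vectors
    std::printf("tile: %ux%u, striped in %u-wide columns\n",
                out_height, out_width, stripe_width);
    return 0;
}

So on a 256-bit SVE implementation this kernel produces 8x24 output tiles, written back in three 8-wide stripes.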
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index 4f774b133f..e604dcc4bc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,320 +10,237 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
+void sve_interleaved_bf16fp32_dot_8x3VL(
+ const bfloat16 *Apanel, const bfloat16 *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
- K /= 2;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/2) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.h\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z17.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "mov z18.s, #0\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
- ".inst 0x64714096 // bfdot z22.s, z4.h, z1.h[2]\n"
- ".inst 0x64794097 // bfdot z23.s, z4.h, z1.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x646040ac // bfdot z12.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040ae // bfdot z14.s, z5.h, z0.h[2]\n"
- ".inst 0x647840af // bfdot z15.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b8 // bfdot z24.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140ba // bfdot z26.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bb // bfdot z27.s, z5.h, z1.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x646040d0 // bfdot z16.s, z6.h, z0.h[0]\n"
- ".inst 0x646840d1 // bfdot z17.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d2 // bfdot z18.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- ".inst 0x646140dc // bfdot z28.s, z6.h, z1.h[0]\n"
- ".inst 0x646940dd // bfdot z29.s, z6.h, z1.h[1]\n"
- ".inst 0x647140de // bfdot z30.s, z6.h, z1.h[2]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x646a4089 // bfdot z9.s, z4.h, z2.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6472408a // bfdot z10.s, z4.h, z2.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- ".inst 0x647a408b // bfdot z11.s, z4.h, z2.h[3]\n"
- ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
- ".inst 0x646b4095 // bfdot z21.s, z4.h, z3.h[1]\n"
- ".inst 0x64734096 // bfdot z22.s, z4.h, z3.h[2]\n"
- ".inst 0x647b4097 // bfdot z23.s, z4.h, z3.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x646240ac // bfdot z12.s, z5.h, z2.h[0]\n"
- ".inst 0x646a40ad // bfdot z13.s, z5.h, z2.h[1]\n"
- ".inst 0x647240ae // bfdot z14.s, z5.h, z2.h[2]\n"
- ".inst 0x647a40af // bfdot z15.s, z5.h, z2.h[3]\n"
- ".inst 0x646340b8 // bfdot z24.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40b9 // bfdot z25.s, z5.h, z3.h[1]\n"
- ".inst 0x647340ba // bfdot z26.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40bb // bfdot z27.s, z5.h, z3.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x646340dc // bfdot z28.s, z6.h, z3.h[0]\n"
- ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n"
- ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n"
- ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
- ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
- ".inst 0x64714096 // bfdot z22.s, z4.h, z1.h[2]\n"
- ".inst 0x64794097 // bfdot z23.s, z4.h, z1.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x646040ac // bfdot z12.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040ae // bfdot z14.s, z5.h, z0.h[2]\n"
- ".inst 0x647840af // bfdot z15.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b8 // bfdot z24.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140ba // bfdot z26.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bb // bfdot z27.s, z5.h, z1.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x646040d0 // bfdot z16.s, z6.h, z0.h[0]\n"
- ".inst 0x646840d1 // bfdot z17.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d2 // bfdot z18.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- ".inst 0x646140dc // bfdot z28.s, z6.h, z1.h[0]\n"
- ".inst 0x646940dd // bfdot z29.s, z6.h, z1.h[1]\n"
- ".inst 0x647140de // bfdot z30.s, z6.h, z1.h[2]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x646a4089 // bfdot z9.s, z4.h, z2.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- ".inst 0x6472408a // bfdot z10.s, z4.h, z2.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- ".inst 0x647a408b // bfdot z11.s, z4.h, z2.h[3]\n"
- ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
- ".inst 0x646b4095 // bfdot z21.s, z4.h, z3.h[1]\n"
- ".inst 0x64734096 // bfdot z22.s, z4.h, z3.h[2]\n"
- ".inst 0x647b4097 // bfdot z23.s, z4.h, z3.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x646240ac // bfdot z12.s, z5.h, z2.h[0]\n"
- ".inst 0x646a40ad // bfdot z13.s, z5.h, z2.h[1]\n"
- ".inst 0x647240ae // bfdot z14.s, z5.h, z2.h[2]\n"
- ".inst 0x647a40af // bfdot z15.s, z5.h, z2.h[3]\n"
- ".inst 0x646340b8 // bfdot z24.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40b9 // bfdot z25.s, z5.h, z3.h[1]\n"
- ".inst 0x647340ba // bfdot z26.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40bb // bfdot z27.s, z5.h, z3.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
- ".inst 0x646340dc // bfdot z28.s, z6.h, z3.h[0]\n"
- ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n"
- ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n"
- ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
- ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
- ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
- ".inst 0x64714096 // bfdot z22.s, z4.h, z1.h[2]\n"
- ".inst 0x64794097 // bfdot z23.s, z4.h, z1.h[3]\n"
- ".inst 0x646040ac // bfdot z12.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040ae // bfdot z14.s, z5.h, z0.h[2]\n"
- ".inst 0x647840af // bfdot z15.s, z5.h, z0.h[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- ".inst 0x646140b8 // bfdot z24.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140ba // bfdot z26.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bb // bfdot z27.s, z5.h, z1.h[3]\n"
- ".inst 0x646040d0 // bfdot z16.s, z6.h, z0.h[0]\n"
- ".inst 0x646840d1 // bfdot z17.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d2 // bfdot z18.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- ".inst 0x646140dc // bfdot z28.s, z6.h, z1.h[0]\n"
- ".inst 0x646940dd // bfdot z29.s, z6.h, z1.h[1]\n"
- ".inst 0x647140de // bfdot z30.s, z6.h, z1.h[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
- ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
- ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
- ".inst 0x64714096 // bfdot z22.s, z4.h, z1.h[2]\n"
- ".inst 0x64794097 // bfdot z23.s, z4.h, z1.h[3]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x646040ac // bfdot z12.s, z5.h, z0.h[0]\n"
- ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
- ".inst 0x647040ae // bfdot z14.s, z5.h, z0.h[2]\n"
- ".inst 0x647840af // bfdot z15.s, z5.h, z0.h[3]\n"
- ".inst 0x646140b8 // bfdot z24.s, z5.h, z1.h[0]\n"
- ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
- ".inst 0x647140ba // bfdot z26.s, z5.h, z1.h[2]\n"
- ".inst 0x647940bb // bfdot z27.s, z5.h, z1.h[3]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x646040d0 // bfdot z16.s, z6.h, z0.h[0]\n"
- ".inst 0x646840d1 // bfdot z17.s, z6.h, z0.h[1]\n"
- ".inst 0x647040d2 // bfdot z18.s, z6.h, z0.h[2]\n"
- ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
- ".inst 0x646140dc // bfdot z28.s, z6.h, z1.h[0]\n"
- ".inst 0x646940dd // bfdot z29.s, z6.h, z1.h[1]\n"
- ".inst 0x647140de // bfdot z30.s, z6.h, z1.h[2]\n"
- ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
- ".inst 0x646a4089 // bfdot z9.s, z4.h, z2.h[1]\n"
- ".inst 0x6472408a // bfdot z10.s, z4.h, z2.h[2]\n"
- ".inst 0x647a408b // bfdot z11.s, z4.h, z2.h[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
- ".inst 0x646b4095 // bfdot z21.s, z4.h, z3.h[1]\n"
- ".inst 0x64734096 // bfdot z22.s, z4.h, z3.h[2]\n"
- ".inst 0x647b4097 // bfdot z23.s, z4.h, z3.h[3]\n"
- ".inst 0x646240ac // bfdot z12.s, z5.h, z2.h[0]\n"
- ".inst 0x646a40ad // bfdot z13.s, z5.h, z2.h[1]\n"
- ".inst 0x647240ae // bfdot z14.s, z5.h, z2.h[2]\n"
- ".inst 0x647a40af // bfdot z15.s, z5.h, z2.h[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- ".inst 0x646340b8 // bfdot z24.s, z5.h, z3.h[0]\n"
- ".inst 0x646b40b9 // bfdot z25.s, z5.h, z3.h[1]\n"
- ".inst 0x647340ba // bfdot z26.s, z5.h, z3.h[2]\n"
- ".inst 0x647b40bb // bfdot z27.s, z5.h, z3.h[3]\n"
- ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
- ".inst 0x646a40d1 // bfdot z17.s, z6.h, z2.h[1]\n"
- ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
- ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- ".inst 0x646340dc // bfdot z28.s, z6.h, z3.h[0]\n"
- ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n"
- ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
- "4:\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1w z25.s, p0, [%[c_ptr]]\n"
- "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x20]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov %x[Apanel], x21\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "cmp x19, #0x2\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
+ ".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
+ "ld1h { z5.h }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
+ ".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
+ "ld1h { z6.h }, p0/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
+ ".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x6471409a // bfdot z26.s, z4.h, z1.h[2]\n"
+ ".inst 0x6479409d // bfdot z29.s, z4.h, z1.h[3]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x646040a9 // bfdot z9.s, z5.h, z0.h[0]\n"
+ ".inst 0x646840ac // bfdot z12.s, z5.h, z0.h[1]\n"
+ "ld1h { z4.h }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x647040af // bfdot z15.s, z5.h, z0.h[2]\n"
+ ".inst 0x647840b2 // bfdot z18.s, z5.h, z0.h[3]\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x646140b5 // bfdot z21.s, z5.h, z1.h[0]\n"
+ ".inst 0x646940b8 // bfdot z24.s, z5.h, z1.h[1]\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x647140bb // bfdot z27.s, z5.h, z1.h[2]\n"
+ ".inst 0x647940be // bfdot z30.s, z5.h, z1.h[3]\n"
+ "ld1h { z5.h }, p0/Z, [x20, #4, MUL VL]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646840cd // bfdot z13.s, z6.h, z0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x647040d0 // bfdot z16.s, z6.h, z0.h[2]\n"
+ ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x646140d6 // bfdot z22.s, z6.h, z1.h[0]\n"
+ ".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n"
+ ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
+ ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p0/Z, [x20, #5, MUL VL]\n"
+ ".inst 0x64624088 // bfdot z8.s, z4.h, z2.h[0]\n"
+ ".inst 0x646a408b // bfdot z11.s, z4.h, z2.h[1]\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x6472408e // bfdot z14.s, z4.h, z2.h[2]\n"
+ ".inst 0x647a4091 // bfdot z17.s, z4.h, z2.h[3]\n"
+ "addvl x20, x20, #6\n"
+ ".inst 0x64634094 // bfdot z20.s, z4.h, z3.h[0]\n"
+ ".inst 0x646b4097 // bfdot z23.s, z4.h, z3.h[1]\n"
+ ".inst 0x6473409a // bfdot z26.s, z4.h, z3.h[2]\n"
+ ".inst 0x647b409d // bfdot z29.s, z4.h, z3.h[3]\n"
+ ".inst 0x646240a9 // bfdot z9.s, z5.h, z2.h[0]\n"
+ ".inst 0x646a40ac // bfdot z12.s, z5.h, z2.h[1]\n"
+ "ld1h { z4.h }, p0/Z, [x20]\n"
+ ".inst 0x647240af // bfdot z15.s, z5.h, z2.h[2]\n"
+ ".inst 0x647a40b2 // bfdot z18.s, z5.h, z2.h[3]\n"
+ ".inst 0x646340b5 // bfdot z21.s, z5.h, z3.h[0]\n"
+ ".inst 0x646b40b8 // bfdot z24.s, z5.h, z3.h[1]\n"
+ ".inst 0x647340bb // bfdot z27.s, z5.h, z3.h[2]\n"
+ ".inst 0x647b40be // bfdot z30.s, z5.h, z3.h[3]\n"
+ ".inst 0x646240ca // bfdot z10.s, z6.h, z2.h[0]\n"
+ ".inst 0x646a40cd // bfdot z13.s, z6.h, z2.h[1]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647a40d3 // bfdot z19.s, z6.h, z2.h[3]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646b40d9 // bfdot z25.s, z6.h, z3.h[1]\n"
+ ".inst 0x647340dc // bfdot z28.s, z6.h, z3.h[2]\n"
+ ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
+ ".inst 0x6468408b // bfdot z11.s, z4.h, z0.h[1]\n"
+ "ld1h { z5.h }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x6470408e // bfdot z14.s, z4.h, z0.h[2]\n"
+ ".inst 0x64784091 // bfdot z17.s, z4.h, z0.h[3]\n"
+ "ld1h { z6.h }, p0/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
+ ".inst 0x64694097 // bfdot z23.s, z4.h, z1.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ ".inst 0x6471409a // bfdot z26.s, z4.h, z1.h[2]\n"
+ ".inst 0x6479409d // bfdot z29.s, z4.h, z1.h[3]\n"
+ "addvl x20, x20, #3\n"
+ ".inst 0x646040a9 // bfdot z9.s, z5.h, z0.h[0]\n"
+ ".inst 0x646840ac // bfdot z12.s, z5.h, z0.h[1]\n"
+ ".inst 0x647040af // bfdot z15.s, z5.h, z0.h[2]\n"
+ ".inst 0x647840b2 // bfdot z18.s, z5.h, z0.h[3]\n"
+ ".inst 0x646140b5 // bfdot z21.s, z5.h, z1.h[0]\n"
+ ".inst 0x646940b8 // bfdot z24.s, z5.h, z1.h[1]\n"
+ ".inst 0x647140bb // bfdot z27.s, z5.h, z1.h[2]\n"
+ ".inst 0x647940be // bfdot z30.s, z5.h, z1.h[3]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646840cd // bfdot z13.s, z6.h, z0.h[1]\n"
+ ".inst 0x647040d0 // bfdot z16.s, z6.h, z0.h[2]\n"
+ ".inst 0x647840d3 // bfdot z19.s, z6.h, z0.h[3]\n"
+ ".inst 0x646140d6 // bfdot z22.s, z6.h, z1.h[0]\n"
+ ".inst 0x646940d9 // bfdot z25.s, z6.h, z1.h[1]\n"
+ ".inst 0x647140dc // bfdot z28.s, z6.h, z1.h[2]\n"
+ ".inst 0x647940df // bfdot z31.s, z6.h, z1.h[3]\n"
+ "cbz x19, 5f\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1h { z7.h }, p0/Z, [x20]\n"
+ "ld1h { z4.h }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "addvl x20, x20, #3\n"
+ ".inst 0x646040e8 // bfdot z8.s, z7.h, z0.h[0]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x647040ee // bfdot z14.s, z7.h, z0.h[2]\n"
+ ".inst 0x647840f1 // bfdot z17.s, z7.h, z0.h[3]\n"
+ ".inst 0x646140f4 // bfdot z20.s, z7.h, z1.h[0]\n"
+ ".inst 0x646940f7 // bfdot z23.s, z7.h, z1.h[1]\n"
+ ".inst 0x647140fa // bfdot z26.s, z7.h, z1.h[2]\n"
+ ".inst 0x647940fd // bfdot z29.s, z7.h, z1.h[3]\n"
+ ".inst 0x64604089 // bfdot z9.s, z4.h, z0.h[0]\n"
+ ".inst 0x6468408c // bfdot z12.s, z4.h, z0.h[1]\n"
+ ".inst 0x6470408f // bfdot z15.s, z4.h, z0.h[2]\n"
+ ".inst 0x64784092 // bfdot z18.s, z4.h, z0.h[3]\n"
+ ".inst 0x64614095 // bfdot z21.s, z4.h, z1.h[0]\n"
+ ".inst 0x64694098 // bfdot z24.s, z4.h, z1.h[1]\n"
+ ".inst 0x6471409b // bfdot z27.s, z4.h, z1.h[2]\n"
+ ".inst 0x6479409e // bfdot z30.s, z4.h, z1.h[3]\n"
+ ".inst 0x646040aa // bfdot z10.s, z5.h, z0.h[0]\n"
+ ".inst 0x646840ad // bfdot z13.s, z5.h, z0.h[1]\n"
+ ".inst 0x647040b0 // bfdot z16.s, z5.h, z0.h[2]\n"
+ ".inst 0x647840b3 // bfdot z19.s, z5.h, z0.h[3]\n"
+ ".inst 0x646140b6 // bfdot z22.s, z5.h, z1.h[0]\n"
+ ".inst 0x646940b9 // bfdot z25.s, z5.h, z1.h[1]\n"
+ ".inst 0x647140bc // bfdot z28.s, z5.h, z1.h[2]\n"
+ ".inst 0x647940bf // bfdot z31.s, z5.h, z1.h[3]\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
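The rewrite in this file is representative of the whole patch: the old version kept C++ for loops over ablocks/bblocks around a per-tile asm block with five read-write register operands, while the new version packs bblocks, K and Bpanel into a local KernelArgs struct, runs the outer loops inside a single asm block, and loads each field through one args_ptr operand plus offsetof() immediates. A stripped-down sketch of that operand-passing pattern follows; the struct fields and the trivial loop body are ours, not library code.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Minimal model of the KernelArgs pattern: constant loop parameters sit
// behind one pointer operand instead of tying up one register operand
// each, leaving more registers free inside the asm. AArch64-only sketch.
static std::uint64_t sum_repeated(std::uint64_t value)
{
    struct KernelArgs {
        std::uint64_t repeats = {};
        std::uint64_t value   = {};
    } ka;
    ka.repeats = 4;
    ka.value   = value;

    std::uint64_t acc = 0;
#if defined(__aarch64__)
    __asm__ __volatile__(
        "ldr x20, [%x[args_ptr], %[offsetof_repeats]]\n"
        "ldr x21, [%x[args_ptr], %[offsetof_value]]\n"
        "1:\n"
        "add %x[acc], %x[acc], x21\n"
        "subs x20, x20, #0x1\n"
        "bne 1b\n"
        : [acc] "+&r" (acc)
        : [args_ptr] "r" (&ka),
          [offsetof_repeats] "I" (offsetof(KernelArgs, repeats)),
          [offsetof_value] "I" (offsetof(KernelArgs, value))
        : "cc", "memory", "x20", "x21");
#else
    acc = ka.repeats * ka.value; // portable stand-in for other targets
#endif
    return acc;
}

int main()
{
    std::printf("%llu\n", (unsigned long long) sum_repeated(10)); // prints 40
    return 0;
}

Keeping constant parameters behind a single pointer is what allows the rewritten kernels above to address x19-x28 freely in loops this register-hungry.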
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index b7fc515341..fa44bdbd31 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,63 +10,104 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../bfloat.hpp"
#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const bfloat16 *, const bfloat16 *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_mmla_8x3VL( ARGLIST );
-class cls_sve_interleaved_bf16fp32_mmla_8x3VL {
+class cls_sve_interleaved_bf16fp32_mmla_8x3VL
+{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return get_vector_length<float>() * 3;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return get_vector_length<float>();
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 4;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
+
+ if (std::is_same<T, bfloat16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.41, 4.30, 7.14 };
+ case CPUModel::A510:
+ return { 7.78, 4.01, 2.43 };
+ case CPUModel::V1:
+ return { 62.50, 5.09, 11.32 };
+ }
+ }
- kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL;
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 30.86, 2.36, 5.28 };
+ case CPUModel::A510:
+ return { 7.75, 2.47, 2.39 };
+ case CPUModel::V1:
+ return { 60.83, 2.69, 8.66 };
+ }
+ }
+
+ return { 1.0 };
+ }
+
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL;
cls_sve_interleaved_bf16fp32_mmla_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
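Unlike the dot-product header above, this one also returns parameters for T = float: that is the hook that lets the fp32 fast-mode path described in the commit message consider this bf16 kernel for float GEMMs, trading bf16's 8-bit significand for roughly double the peak throughput. The definition of the returned struct lives in performance_parameters.hpp, outside this diff, so the field names in the sketch below are assumptions and pick_faster is a hypothetical helper; only the numeric triples are copied from the V1 entries in the two headers.

#include <cstdio>

// Assumed mirror of the three-field struct returned by
// get_performance_parameters(); the real definition is in
// performance_parameters.hpp and may name the fields differently.
struct PerformanceParameters {
    double kernel_macs_cycle;   // assumed: inner-loop MAC throughput
    double prepare_bytes_cycle; // assumed: panel-preparation bandwidth
    double merge_bytes_cycle;   // assumed: result-merge bandwidth
};

// Toy selection rule: take the kernel with the higher inner-loop
// throughput. The real heuristic presumably weighs all three numbers
// against the problem shape; this only shows why per-model figures help.
static const char *pick_faster(PerformanceParameters a, const char *name_a,
                               PerformanceParameters b, const char *name_b)
{
    return (a.kernel_macs_cycle >= b.kernel_macs_cycle) ? name_a : name_b;
}

int main()
{
    // V1 entries from the two bf16fp32 kernel headers in this diff.
    PerformanceParameters dot_v1  = {31.82, 5.11, 11.20};
    PerformanceParameters mmla_v1 = {62.50, 5.09, 11.32};
    std::printf("prefer %s\n", pick_faster(dot_v1,  "bf16fp32_dot_8x3VL",
                                           mmla_v1, "bf16fp32_mmla_8x3VL"));
    return 0;
}

On a V1 the mmla variant roughly doubles the dot variant's advertised peak, which is the sort of gap these per-model tables let the dispatcher exploit instead of relying on a fixed preference order.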
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
index c720942140..de4f0ad313 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,388 +10,284 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
+void sve_interleaved_bf16fp32_mmla_8x3VL(
+ const bfloat16 *Apanel, const bfloat16 *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
- K /= 4;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const bfloat16 *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.h\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z17.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
- ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
- ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6464e47e // bfmmla z30.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "addvl %[b_ptr], %[b_ptr], #12\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e47b // bfmmla z27.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
- ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e432 // bfmmla z18.s, z1.h, z6.h\n"
- ".inst 0x6466e458 // bfmmla z24.s, z2.h, z6.h\n"
- ".inst 0x6466e47e // bfmmla z30.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr]]\n"
- ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
- ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6464e47e // bfmmla z30.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- "addvl %[b_ptr], %[b_ptr], #14\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e47b // bfmmla z27.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
- ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e432 // bfmmla z18.s, z1.h, z6.h\n"
- ".inst 0x6466e458 // bfmmla z24.s, z2.h, z6.h\n"
- ".inst 0x6466e47e // bfmmla z30.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
- ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
- ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6464e47e // bfmmla z30.s, z3.h, z4.h\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
- ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
- ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
- ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
- ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
- ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
- ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
- ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
- ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
- ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x6464e40c // bfmmla z12.s, z0.h, z4.h\n"
- ".inst 0x6464e432 // bfmmla z18.s, z1.h, z4.h\n"
- ".inst 0x6464e458 // bfmmla z24.s, z2.h, z4.h\n"
- ".inst 0x6464e47e // bfmmla z30.s, z3.h, z4.h\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
- "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
- "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
- ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
- ".inst 0x6467e409 // bfmmla z9.s, z0.h, z7.h\n"
- ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x6467e42f // bfmmla z15.s, z1.h, z7.h\n"
- ".inst 0x6467e455 // bfmmla z21.s, z2.h, z7.h\n"
- ".inst 0x6467e47b // bfmmla z27.s, z3.h, z7.h\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
- ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
- ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
- ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
- ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
- ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
- ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
- ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x6466e40c // bfmmla z12.s, z0.h, z6.h\n"
- ".inst 0x6466e432 // bfmmla z18.s, z1.h, z6.h\n"
- ".inst 0x6466e458 // bfmmla z24.s, z2.h, z6.h\n"
- ".inst 0x6466e47e // bfmmla z30.s, z3.h, z6.h\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- "4:\n"
- "uzp2 z2.d, z16.d, z17.d\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "uzp2 z3.d, z18.d, z19.d\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "uzp1 z4.d, z20.d, z21.d\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "uzp1 z5.d, z22.d, z23.d\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "uzp1 z6.d, z24.d, z25.d\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "uzp2 z7.d, z20.d, z21.d\n"
- "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "uzp2 z0.d, z22.d, z23.d\n"
- "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "uzp2 z1.d, z24.d, z25.d\n"
- "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "uzp1 z2.d, z26.d, z27.d\n"
- "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "uzp1 z3.d, z28.d, z29.d\n"
- "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "uzp1 z4.d, z30.d, z31.d\n"
- "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "uzp2 z5.d, z26.d, z27.d\n"
- "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "uzp2 z6.d, z28.d, z29.d\n"
- "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "uzp2 z7.d, z30.d, z31.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1h { z4.h }, p0/Z, [x20]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1h { z5.h }, p0/Z, [x20, #1, MUL VL]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "addvl x20, x20, #2\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+ ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
+ ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
+ "ld1h { z6.h }, p0/Z, [x20]\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
+ "ld1h { z7.h }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
+ ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
+ "ld1h { z4.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n"
+ "ld1h { z6.h }, p0/Z, [x20, #4, MUL VL]\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
+ ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
+ ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
+ ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n"
+ "ld1h { z7.h }, p0/Z, [x20, #5, MUL VL]\n"
+ ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
+ ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
+ ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1h { z4.h }, p0/Z, [x20, #6, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
+ ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
+ "ld1h { z6.h }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1h { z7.h }, p0/Z, [x20, #-7, MUL VL]\n"
+ ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n"
+ ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n"
+ ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n"
+ ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n"
+ "ld1h { z4.h }, p0/Z, [x20, #-6, MUL VL]\n"
+ ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
+ ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #96]\n"
+ ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n"
+ ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n"
+ "ld1h { z5.h }, p0/Z, [x20, #-5, MUL VL]\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #112]\n"
+ ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
+ ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "addvl x20, x20, #-4\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+ ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+ ".inst 0x6465e40b // bfmmla z11.s, z0.h, z5.h\n"
+ ".inst 0x6465e431 // bfmmla z17.s, z1.h, z5.h\n"
+ "ld1h { z6.h }, p0/Z, [x20]\n"
+ ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+ ".inst 0x6465e457 // bfmmla z23.s, z2.h, z5.h\n"
+ "ld1h { z7.h }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
+ ".inst 0x6465e47d // bfmmla z29.s, z3.h, z5.h\n"
+ "ld1h { z4.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1h { z5.h }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x6466e409 // bfmmla z9.s, z0.h, z6.h\n"
+ ".inst 0x6466e42f // bfmmla z15.s, z1.h, z6.h\n"
+ ".inst 0x6466e455 // bfmmla z21.s, z2.h, z6.h\n"
+ ".inst 0x6466e47b // bfmmla z27.s, z3.h, z6.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x6467e40c // bfmmla z12.s, z0.h, z7.h\n"
+ ".inst 0x6464e40a // bfmmla z10.s, z0.h, z4.h\n"
+ "addvl x20, x20, #4\n"
+ ".inst 0x6465e40d // bfmmla z13.s, z0.h, z5.h\n"
+ ".inst 0x6467e432 // bfmmla z18.s, z1.h, z7.h\n"
+ ".inst 0x6464e430 // bfmmla z16.s, z1.h, z4.h\n"
+ ".inst 0x6465e433 // bfmmla z19.s, z1.h, z5.h\n"
+ ".inst 0x6467e458 // bfmmla z24.s, z2.h, z7.h\n"
+ ".inst 0x6467e47e // bfmmla z30.s, z3.h, z7.h\n"
+ ".inst 0x6464e456 // bfmmla z22.s, z2.h, z4.h\n"
+ ".inst 0x6465e459 // bfmmla z25.s, z2.h, z5.h\n"
+ ".inst 0x6464e47c // bfmmla z28.s, z3.h, z4.h\n"
+ ".inst 0x6465e47f // bfmmla z31.s, z3.h, z5.h\n"
+ "cbz x19, 5f\n"
+ "ld1h { z6.h }, p0/Z, [x20]\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x6466e408 // bfmmla z8.s, z0.h, z6.h\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1h { z7.h }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x6466e42e // bfmmla z14.s, z1.h, z6.h\n"
+ "ld1rqh { z2.h }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqh { z3.h }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x6467e40b // bfmmla z11.s, z0.h, z7.h\n"
+ ".inst 0x6467e431 // bfmmla z17.s, z1.h, z7.h\n"
+ ".inst 0x6466e454 // bfmmla z20.s, z2.h, z6.h\n"
+ "ld1h { z4.h }, p0/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x6467e457 // bfmmla z23.s, z2.h, z7.h\n"
+ ".inst 0x6466e47a // bfmmla z26.s, z3.h, z6.h\n"
+ "ld1h { z5.h }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x6467e47d // bfmmla z29.s, z3.h, z7.h\n"
+ "ld1h { z6.h }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1h { z7.h }, p0/Z, [x20, #5, MUL VL]\n"
+ ".inst 0x6464e409 // bfmmla z9.s, z0.h, z4.h\n"
+ ".inst 0x6464e42f // bfmmla z15.s, z1.h, z4.h\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x6464e455 // bfmmla z21.s, z2.h, z4.h\n"
+ ".inst 0x6464e47b // bfmmla z27.s, z3.h, z4.h\n"
+ "addvl x20, x20, #6\n"
+ ".inst 0x6465e40c // bfmmla z12.s, z0.h, z5.h\n"
+ ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
+ ".inst 0x6467e40d // bfmmla z13.s, z0.h, z7.h\n"
+ ".inst 0x6465e432 // bfmmla z18.s, z1.h, z5.h\n"
+ ".inst 0x6466e430 // bfmmla z16.s, z1.h, z6.h\n"
+ ".inst 0x6467e433 // bfmmla z19.s, z1.h, z7.h\n"
+ ".inst 0x6465e458 // bfmmla z24.s, z2.h, z5.h\n"
+ ".inst 0x6465e47e // bfmmla z30.s, z3.h, z5.h\n"
+ ".inst 0x6466e456 // bfmmla z22.s, z2.h, z6.h\n"
+ ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
+ ".inst 0x6466e47c // bfmmla z28.s, z3.h, z6.h\n"
+ ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
+ "5:" // multiply loop done
+ "uzp1 z4.d, z8.d, z11.d\n"
+ "uzp2 z8.d, z8.d, z11.d\n"
+ "st1w { z4.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z11.d, z9.d, z12.d\n"
+ "uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z12.d, z10.d, z13.d\n"
+ "uzp2 z10.d, z10.d, z13.d\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z13.d, z14.d, z17.d\n"
+ "uzp2 z14.d, z14.d, z17.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z17.d, z15.d, z18.d\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "uzp2 z15.d, z15.d, z18.d\n"
+ "uzp1 z18.d, z16.d, z19.d\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z19.d\n"
+ "uzp1 z19.d, z20.d, z23.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "uzp2 z20.d, z20.d, z23.d\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
+ "uzp2 z21.d, z21.d, z24.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z24.d, z22.d, z25.d\n"
+ "uzp2 z22.d, z22.d, z25.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z25.d, z26.d, z29.d\n"
+ "uzp2 z26.d, z26.d, z29.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z29.d, z27.d, z30.d\n"
+ "uzp2 z27.d, z27.d, z30.d\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z30.d, z28.d, z31.d\n"
+ "uzp2 z28.d, z28.d, z31.d\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
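
The store sequence in the kernel above interleaves uzp1/uzp2 with the st1w stores because BFMMLA writes its results as 2x2 tiles: each accumulator holds a pair of output rows with their elements interleaved in 64-bit chunks, so pairs of accumulators must be unzipped into plain rows before they can be written to Cpanel contiguously. A minimal C++ model of the 64-bit uzp1/uzp2 used here (register contents reduced to vectors of float pairs; all names are illustrative, not taken from the kernel):

#include <cstddef>
#include <utility>
#include <vector>

// One 64-bit SVE element == one pair of fp32 lanes.
using Elem = std::pair<float, float>;
using Vec  = std::vector<Elem>;

// uzp1: concatenate the even-numbered elements of a, then of b.
Vec uzp1(const Vec &a, const Vec &b) {
    Vec r;
    for (size_t i = 0; i < a.size(); i += 2) r.push_back(a[i]);
    for (size_t i = 0; i < b.size(); i += 2) r.push_back(b[i]);
    return r;
}

// uzp2: the same with the odd-numbered elements.
Vec uzp2(const Vec &a, const Vec &b) {
    Vec r;
    for (size_t i = 1; i < a.size(); i += 2) r.push_back(a[i]);
    for (size_t i = 1; i < b.size(); i += 2) r.push_back(b[i]);
    return r;
}

// After BFMMLA, an accumulator pair such as z8/z11 holds two output rows
// interleaved, so "uzp1 z4.d, z8.d, z11.d" above collects one row and the
// matching uzp2 collects the other before both are stored.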
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
index b797b8bec1..6f1089d517 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,63 +10,99 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const __fp16 *, const __fp16 *, \
+ __fp16 *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_fp16_mla_8x3VL(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void sve_interleaved_fp16_mla_8x3VL( ARGLIST );
+void sve_interleaved_fp16_mla_8x3VL_a64fx( ARGLIST );
-class cls_sve_interleaved_fp16_mla_8x3VL {
+class cls_sve_interleaved_fp16_mla_8x3VL
+{
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
- typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return get_vector_length<__fp16>() * 3;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return get_vector_length<__fp16>();
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 1;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_fp16_mla_8x3VL;
+ if (std::is_same<T, __fp16>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 15.96, 3.85, 6.91 };
+ case CPUModel::A510:
+ return { 13.84, 2.07, 2.52 };
+ case CPUModel::V1:
+ return { 31.90, 5.15, 10.34 };
+ }
+ }
+
+ return { 1.0 };
+ }
- cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *)
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_fp16_mla_8x3VL;
+ cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *ci)
{
-
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_interleaved_fp16_mla_8x3VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
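
Beyond adding the a64fx entry point, the header rewrite above shows the dispatch pattern this patch applies across the kernel classes: kernel defaults to the generic implementation, the constructor swaps in a CPU-specific variant when one exists, and get_performance_parameters() reports per-model throughput estimates for the improved selection heuristics. A condensed, self-contained sketch of the same pattern (CPUModel, CPUInfo, and the PerformanceParameters fields are stand-ins for the library's real types; the numbers are the estimates from the header above):

#include <type_traits>

enum class CPUModel { GENERIC, A510, V1, A64FX };
struct CPUInfo {
    CPUModel model;
    CPUModel get_cpu_model() const { return model; }
};
// Stand-in for the library's PerformanceParameters: a compute-rate term
// plus prepare/merge bandwidth terms consumed by the GEMM selection code.
struct PerformanceParameters {
    double kernel_macs_cycle;
    double prepare_bytes_cycle = 0;
    double merge_bytes_cycle   = 0;
};

using kern_type = void (*)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
void generic_kernel(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
void a64fx_kernel(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);

class cls_example {
public:
    template <typename T>
    static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
        if (std::is_same<T, __fp16>::value) {
            switch (ci->get_cpu_model()) {
                case CPUModel::A510: return { 13.84, 2.07, 2.52 };
                case CPUModel::V1:   return { 31.90, 5.15, 10.34 };
                default:             return { 15.96, 3.85, 6.91 };
            }
        }
        return { 1.0 };
    }

    kern_type kernel = generic_kernel;   // default to the generic kernel
    cls_example(const CPUInfo *ci) {
        if (ci->get_cpu_model() == CPUModel::A64FX) {
            kernel = a64fx_kernel;       // per-CPU override
        }
    }
};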
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..602634706e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/a64fx.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void sve_interleaved_fp16_mla_8x3VL_a64fx(
+ const __fp16 *Apanel, const __fp16 *Bpanel,
+ __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1h { z0.h }, p0/Z, [x20]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1h { z1.h }, p0/Z, [x20, #1, MUL VL]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla z8.h, p0/M, z0.h, z3.h\n"
+ "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "sub x19, x19, #0x2\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "fmla z12.h, p0/M, z1.h, z4.h\n"
+ "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z14.h, p0/M, z0.h, z5.h\n"
+ "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "cmp x19, #0x2\n"
+ "fmla z16.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z17.h, p0/M, z0.h, z6.h\n"
+ "fmla z18.h, p0/M, z1.h, z6.h\n"
+ "fmla z19.h, p0/M, z2.h, z6.h\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z3.h\n"
+ "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "fmla z22.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "fmla z24.h, p0/M, z1.h, z4.h\n"
+ "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #18]\n"
+ "fmla z26.h, p0/M, z0.h, z5.h\n"
+ "fmla z27.h, p0/M, z1.h, z5.h\n"
+ "fmla z28.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z29.h, p0/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "fmla z30.h, p0/M, z1.h, z6.h\n"
+ "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "ld1h { z1.h }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x20, #5, MUL VL]\n"
+ "fmla z8.h, p0/M, z0.h, z3.h\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #22]\n"
+ "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z12.h, p0/M, z1.h, z4.h\n"
+ "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #26]\n"
+ "fmla z14.h, p0/M, z0.h, z5.h\n"
+ "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "addvl x20, x20, #6\n"
+ "fmla z16.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z17.h, p0/M, z0.h, z6.h\n"
+ "fmla z18.h, p0/M, z1.h, z6.h\n"
+ "fmla z19.h, p0/M, z2.h, z6.h\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #30]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z20.h, p0/M, z0.h, z3.h\n"
+ "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "fmla z22.h, p0/M, z2.h, z3.h\n"
+ "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z24.h, p0/M, z1.h, z4.h\n"
+ "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
+ "fmla z26.h, p0/M, z0.h, z5.h\n"
+ "fmla z27.h, p0/M, z1.h, z5.h\n"
+ "fmla z28.h, p0/M, z2.h, z5.h\n"
+ "fmla z29.h, p0/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p0/Z, [x20]\n"
+ "fmla z30.h, p0/M, z1.h, z6.h\n"
+ "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "ld1h { z1.h }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla z8.h, p0/M, z0.h, z3.h\n"
+ "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "addvl x20, x20, #3\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "fmla z12.h, p0/M, z1.h, z4.h\n"
+ "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z14.h, p0/M, z0.h, z5.h\n"
+ "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "fmla z16.h, p0/M, z2.h, z5.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z17.h, p0/M, z0.h, z6.h\n"
+ "fmla z18.h, p0/M, z1.h, z6.h\n"
+ "fmla z19.h, p0/M, z2.h, z6.h\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
+ "fmla z20.h, p0/M, z0.h, z3.h\n"
+ "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z22.h, p0/M, z2.h, z3.h\n"
+ "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "fmla z24.h, p0/M, z1.h, z4.h\n"
+ "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "fmla z26.h, p0/M, z0.h, z5.h\n"
+ "fmla z27.h, p0/M, z1.h, z5.h\n"
+ "fmla z28.h, p0/M, z2.h, z5.h\n"
+ "fmla z29.h, p0/M, z0.h, z6.h\n"
+ "fmla z30.h, p0/M, z1.h, z6.h\n"
+ "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "cbz x19, 5f\n"
+ "ld1h { z0.h }, p0/Z, [x20]\n"
+ "ld1h { z1.h }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z2.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.h, p0/M, z0.h, z3.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #2]\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z9.h, p0/M, z1.h, z3.h\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #6]\n"
+ "fmla z10.h, p0/M, z2.h, z3.h\n"
+ "fmla z11.h, p0/M, z0.h, z4.h\n"
+ "ld1rh { z3.h }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z12.h, p0/M, z1.h, z4.h\n"
+ "fmla z13.h, p0/M, z2.h, z4.h\n"
+ "ld1rh { z4.h }, p0/Z, [%x[Apanel], #10]\n"
+ "fmla z14.h, p0/M, z0.h, z5.h\n"
+ "fmla z15.h, p0/M, z1.h, z5.h\n"
+ "fmla z16.h, p0/M, z2.h, z5.h\n"
+ "fmla z17.h, p0/M, z0.h, z6.h\n"
+ "ld1rh { z5.h }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z18.h, p0/M, z1.h, z6.h\n"
+ "fmla z19.h, p0/M, z2.h, z6.h\n"
+ "ld1rh { z6.h }, p0/Z, [%x[Apanel], #14]\n"
+ "addvl x20, x20, #3\n"
+ "fmla z20.h, p0/M, z0.h, z3.h\n"
+ "fmla z21.h, p0/M, z1.h, z3.h\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z22.h, p0/M, z2.h, z3.h\n"
+ "fmla z23.h, p0/M, z0.h, z4.h\n"
+ "fmla z24.h, p0/M, z1.h, z4.h\n"
+ "fmla z25.h, p0/M, z2.h, z4.h\n"
+ "fmla z26.h, p0/M, z0.h, z5.h\n"
+ "fmla z27.h, p0/M, z1.h, z5.h\n"
+ "fmla z28.h, p0/M, z2.h, z5.h\n"
+ "fmla z29.h, p0/M, z0.h, z6.h\n"
+ "fmla z30.h, p0/M, z1.h, z6.h\n"
+ "fmla z31.h, p0/M, z2.h, z6.h\n"
+ "5:" // multiply loop done
+ "st1h { z8.h }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z15.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1h { z16.h }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1h { z17.h }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1h { z18.h }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1h { z19.h }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1h { z20.h }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1h { z21.h }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1h { z22.h }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1h { z23.h }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1h { z24.h }, p0, [%x[Cpanel]]\n"
+ "st1h { z25.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z26.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z28.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z29.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z30.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z31.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
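
Compared with the generic kernel in the next hunk, this A64FX variant is built around a different multiply form: it broadcasts a single A element per ld1rh and uses the predicated vector-by-vector "fmla z, p0/M, z, z", while the generic version loads a full quadword of A with ld1rqh and multiplies by an indexed lane (z0.h[n]) — presumably the broadcast form maps better onto A64FX's pipelines. Arithmetically both perform the same rank-1 updates; a scalar model of one K step of the 8x3VL block (names and the helper itself are illustrative, vl being the SVE vector length in halfwords):

#include <cstddef>

// One K step of the 8x3VL micro-kernel, scalarised: eight broadcast A
// values times three vector-lengths of B, accumulated into an 8 x (3*vl)
// tile. On A64FX the broadcast comes from ld1rh; the generic kernel takes
// the same value from an indexed fmla lane instead.
void kstep_8x3vl(const __fp16 *a8, const __fp16 *b3vl, __fp16 *acc, size_t vl) {
    for (size_t row = 0; row < 8; ++row) {          // 8 broadcast A elements
        for (size_t col = 0; col < 3 * vl; ++col) { // 3 B vectors of length vl
            acc[row * 3 * vl + col] += a8[row] * b3vl[col];
        }
    }
}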
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
index 0f1937acc5..f8e4b89b95 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,310 +10,232 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../../asmlib.hpp"
+#include <cstddef>
namespace arm_gemm {
-void sve_interleaved_fp16_mla_8x3VL(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
- const __fp16 *a_ptr = Apanel;
- __fp16 *c_ptr = Cpanel;
-
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+void sve_interleaved_fp16_mla_8x3VL(
+ const __fp16 *Apanel, const __fp16 *Bpanel,
+ __fp16 *Cpanel, int ablocks, int bblocks, int K) {
- for (int yb=0; yb<ablocks; yb++) {
- const __fp16 *a_ptr0 = a_ptr;
- const __fp16 *b_ptr = Bpanel;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const __fp16 *Bpanel = {};
+ } ka;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
- __asm __volatile (
- "mov z8.h, #0\n"
- "ptrue p0.h\n"
- "mov z9.h, #0\n"
- "mov z10.h, #0\n"
- "mov z11.h, #0\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "mov z12.h, #0\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
- "mov z13.h, #0\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z14.h, #0\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z15.h, #0\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
- "mov z16.h, #0\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
- "mov z17.h, #0\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "mov z18.h, #0\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "mov z19.h, #0\n"
- "mov z20.h, #0\n"
- "mov z21.h, #0\n"
- "mov z22.h, #0\n"
- "mov z23.h, #0\n"
- "mov z24.h, #0\n"
- "mov z25.h, #0\n"
- "mov z26.h, #0\n"
- "mov z27.h, #0\n"
- "mov z28.h, #0\n"
- "mov z29.h, #0\n"
- "mov z30.h, #0\n"
- "mov z31.h, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla z8.h, z2.h, z0.h[0]\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.h, z2.h, z0.h[1]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.h, z2.h, z0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z11.h, z2.h, z0.h[3]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
- "fmla z13.h, z2.h, z0.h[5]\n"
- "fmla z14.h, z2.h, z0.h[6]\n"
- "fmla z15.h, z2.h, z0.h[7]\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
- "fmla z16.h, z3.h, z0.h[0]\n"
- "fmla z17.h, z3.h, z0.h[1]\n"
- "fmla z18.h, z3.h, z0.h[2]\n"
- "fmla z19.h, z3.h, z0.h[3]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
- "fmla z21.h, z3.h, z0.h[5]\n"
- "fmla z22.h, z3.h, z0.h[6]\n"
- "fmla z23.h, z3.h, z0.h[7]\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "fmla z24.h, z4.h, z0.h[0]\n"
- "fmla z25.h, z4.h, z0.h[1]\n"
- "fmla z26.h, z4.h, z0.h[2]\n"
- "fmla z27.h, z4.h, z0.h[3]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
- "fmla z29.h, z4.h, z0.h[5]\n"
- "fmla z30.h, z4.h, z0.h[6]\n"
- "fmla z31.h, z4.h, z0.h[7]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "fmla z8.h, z5.h, z1.h[0]\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "fmla z9.h, z5.h, z1.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "fmla z10.h, z5.h, z1.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "fmla z11.h, z5.h, z1.h[3]\n"
- "fmla z12.h, z5.h, z1.h[4]\n"
- "fmla z13.h, z5.h, z1.h[5]\n"
- "fmla z14.h, z5.h, z1.h[6]\n"
- "fmla z15.h, z5.h, z1.h[7]\n"
- "ld1h z5.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "fmla z16.h, z6.h, z1.h[0]\n"
- "fmla z17.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z1.h[2]\n"
- "fmla z19.h, z6.h, z1.h[3]\n"
- "fmla z20.h, z6.h, z1.h[4]\n"
- "fmla z21.h, z6.h, z1.h[5]\n"
- "fmla z22.h, z6.h, z1.h[6]\n"
- "fmla z23.h, z6.h, z1.h[7]\n"
- "ld1h z6.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "fmla z24.h, z7.h, z1.h[0]\n"
- "fmla z25.h, z7.h, z1.h[1]\n"
- "fmla z26.h, z7.h, z1.h[2]\n"
- "fmla z27.h, z7.h, z1.h[3]\n"
- "fmla z28.h, z7.h, z1.h[4]\n"
- "fmla z29.h, z7.h, z1.h[5]\n"
- "fmla z30.h, z7.h, z1.h[6]\n"
- "fmla z31.h, z7.h, z1.h[7]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- "fmla z8.h, z2.h, z0.h[0]\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.h, z2.h, z0.h[1]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.h, z2.h, z0.h[2]\n"
- "fmla z11.h, z2.h, z0.h[3]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
- "fmla z13.h, z2.h, z0.h[5]\n"
- "fmla z14.h, z2.h, z0.h[6]\n"
- "fmla z15.h, z2.h, z0.h[7]\n"
- "ld1h z2.h, p0/z, [%[b_ptr]]\n"
- "fmla z16.h, z3.h, z0.h[0]\n"
- "fmla z17.h, z3.h, z0.h[1]\n"
- "fmla z18.h, z3.h, z0.h[2]\n"
- "fmla z19.h, z3.h, z0.h[3]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
- "fmla z21.h, z3.h, z0.h[5]\n"
- "fmla z22.h, z3.h, z0.h[6]\n"
- "fmla z23.h, z3.h, z0.h[7]\n"
- "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "fmla z24.h, z4.h, z0.h[0]\n"
- "fmla z25.h, z4.h, z0.h[1]\n"
- "fmla z26.h, z4.h, z0.h[2]\n"
- "fmla z27.h, z4.h, z0.h[3]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
- "fmla z29.h, z4.h, z0.h[5]\n"
- "fmla z30.h, z4.h, z0.h[6]\n"
- "fmla z31.h, z4.h, z0.h[7]\n"
- "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "fmla z8.h, z5.h, z1.h[0]\n"
- "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
- "fmla z9.h, z5.h, z1.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x10\n"
- "fmla z10.h, z5.h, z1.h[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "fmla z11.h, z5.h, z1.h[3]\n"
- "fmla z12.h, z5.h, z1.h[4]\n"
- "fmla z13.h, z5.h, z1.h[5]\n"
- "fmla z14.h, z5.h, z1.h[6]\n"
- "fmla z15.h, z5.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z1.h[0]\n"
- "fmla z17.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z1.h[2]\n"
- "fmla z19.h, z6.h, z1.h[3]\n"
- "fmla z20.h, z6.h, z1.h[4]\n"
- "fmla z21.h, z6.h, z1.h[5]\n"
- "fmla z22.h, z6.h, z1.h[6]\n"
- "fmla z23.h, z6.h, z1.h[7]\n"
- "fmla z24.h, z7.h, z1.h[0]\n"
- "fmla z25.h, z7.h, z1.h[1]\n"
- "fmla z26.h, z7.h, z1.h[2]\n"
- "fmla z27.h, z7.h, z1.h[3]\n"
- "fmla z28.h, z7.h, z1.h[4]\n"
- "fmla z29.h, z7.h, z1.h[5]\n"
- "fmla z30.h, z7.h, z1.h[6]\n"
- "fmla z31.h, z7.h, z1.h[7]\n"
- "fmla z8.h, z2.h, z0.h[0]\n"
- "fmla z9.h, z2.h, z0.h[1]\n"
- "fmla z10.h, z2.h, z0.h[2]\n"
- "fmla z11.h, z2.h, z0.h[3]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
- "fmla z13.h, z2.h, z0.h[5]\n"
- "fmla z14.h, z2.h, z0.h[6]\n"
- "fmla z15.h, z2.h, z0.h[7]\n"
- "fmla z16.h, z3.h, z0.h[0]\n"
- "fmla z17.h, z3.h, z0.h[1]\n"
- "fmla z18.h, z3.h, z0.h[2]\n"
- "fmla z19.h, z3.h, z0.h[3]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
- "fmla z21.h, z3.h, z0.h[5]\n"
- "fmla z22.h, z3.h, z0.h[6]\n"
- "fmla z23.h, z3.h, z0.h[7]\n"
- "fmla z24.h, z4.h, z0.h[0]\n"
- "fmla z25.h, z4.h, z0.h[1]\n"
- "fmla z26.h, z4.h, z0.h[2]\n"
- "fmla z27.h, z4.h, z0.h[3]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
- "fmla z29.h, z4.h, z0.h[5]\n"
- "fmla z30.h, z4.h, z0.h[6]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
- "fmla z31.h, z4.h, z0.h[7]\n"
- "b 4f\n"
- "3:\n"
- "fmla z8.h, z2.h, z0.h[0]\n"
- "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.h, z2.h, z0.h[1]\n"
- "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.h, z2.h, z0.h[2]\n"
- "fmla z11.h, z2.h, z0.h[3]\n"
- "fmla z12.h, z2.h, z0.h[4]\n"
- "fmla z13.h, z2.h, z0.h[5]\n"
- "fmla z14.h, z2.h, z0.h[6]\n"
- "fmla z15.h, z2.h, z0.h[7]\n"
- "fmla z16.h, z3.h, z0.h[0]\n"
- "fmla z17.h, z3.h, z0.h[1]\n"
- "fmla z18.h, z3.h, z0.h[2]\n"
- "fmla z19.h, z3.h, z0.h[3]\n"
- "fmla z20.h, z3.h, z0.h[4]\n"
- "fmla z21.h, z3.h, z0.h[5]\n"
- "fmla z22.h, z3.h, z0.h[6]\n"
- "fmla z23.h, z3.h, z0.h[7]\n"
- "fmla z24.h, z4.h, z0.h[0]\n"
- "fmla z25.h, z4.h, z0.h[1]\n"
- "fmla z26.h, z4.h, z0.h[2]\n"
- "fmla z27.h, z4.h, z0.h[3]\n"
- "fmla z28.h, z4.h, z0.h[4]\n"
- "fmla z29.h, z4.h, z0.h[5]\n"
- "fmla z30.h, z4.h, z0.h[6]\n"
- "fmla z31.h, z4.h, z0.h[7]\n"
- "fmla z8.h, z5.h, z1.h[0]\n"
- "fmla z9.h, z5.h, z1.h[1]\n"
- "fmla z10.h, z5.h, z1.h[2]\n"
- "fmla z11.h, z5.h, z1.h[3]\n"
- "st1h z8.h, p0, [%[c_ptr]]\n"
- "fmla z12.h, z5.h, z1.h[4]\n"
- "fmla z13.h, z5.h, z1.h[5]\n"
- "fmla z14.h, z5.h, z1.h[6]\n"
- "fmla z15.h, z5.h, z1.h[7]\n"
- "fmla z16.h, z6.h, z1.h[0]\n"
- "fmla z17.h, z6.h, z1.h[1]\n"
- "fmla z18.h, z6.h, z1.h[2]\n"
- "fmla z19.h, z6.h, z1.h[3]\n"
- "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
- "fmla z20.h, z6.h, z1.h[4]\n"
- "fmla z21.h, z6.h, z1.h[5]\n"
- "fmla z22.h, z6.h, z1.h[6]\n"
- "fmla z23.h, z6.h, z1.h[7]\n"
- "fmla z24.h, z7.h, z1.h[0]\n"
- "fmla z25.h, z7.h, z1.h[1]\n"
- "fmla z26.h, z7.h, z1.h[2]\n"
- "fmla z27.h, z7.h, z1.h[3]\n"
- "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
- "fmla z28.h, z7.h, z1.h[4]\n"
- "fmla z29.h, z7.h, z1.h[5]\n"
- "fmla z30.h, z7.h, z1.h[6]\n"
- "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
- "fmla z31.h, z7.h, z1.h[7]\n"
- "4:\n"
- "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1h z27.h, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1h z12.h, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1h z20.h, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1h z28.h, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1h z13.h, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1h z21.h, p0, [%[c_ptr]]\n"
- "st1h z29.h, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1h z14.h, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1h z22.h, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1h z30.h, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1h z15.h, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1h z23.h, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1h z31.h, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1h { z2.h }, p0/Z, [x20]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov %x[Apanel], x21\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "cmp x19, #0x2\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "fmla z11.h, z2.h, z0.h[1]\n"
+ "ld1h { z3.h }, p0/Z, [x20, #1, MUL VL]\n"
+ "fmla z14.h, z2.h, z0.h[2]\n"
+ "fmla z17.h, z2.h, z0.h[3]\n"
+ "ld1h { z4.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "fmla z20.h, z2.h, z0.h[4]\n"
+ "fmla z23.h, z2.h, z0.h[5]\n"
+ "ld1rqh { z1.h }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z26.h, z2.h, z0.h[6]\n"
+ "fmla z29.h, z2.h, z0.h[7]\n"
+ "ld1h { z5.h }, p0/Z, [x20, #3, MUL VL]\n"
+ "fmla z9.h, z3.h, z0.h[0]\n"
+ "fmla z12.h, z3.h, z0.h[1]\n"
+ "ld1h { z6.h }, p0/Z, [x20, #4, MUL VL]\n"
+ "fmla z15.h, z3.h, z0.h[2]\n"
+ "fmla z18.h, z3.h, z0.h[3]\n"
+ "ld1h { z7.h }, p0/Z, [x20, #5, MUL VL]\n"
+ "fmla z21.h, z3.h, z0.h[4]\n"
+ "fmla z24.h, z3.h, z0.h[5]\n"
+ "sub x19, x19, #0x2\n"
+ "fmla z27.h, z3.h, z0.h[6]\n"
+ "fmla z30.h, z3.h, z0.h[7]\n"
+ "cmp x19, #0x2\n"
+ "fmla z10.h, z4.h, z0.h[0]\n"
+ "fmla z13.h, z4.h, z0.h[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z16.h, z4.h, z0.h[2]\n"
+ "fmla z19.h, z4.h, z0.h[3]\n"
+ "addvl x20, x20, #6\n"
+ "fmla z22.h, z4.h, z0.h[4]\n"
+ "fmla z25.h, z4.h, z0.h[5]\n"
+ "fmla z28.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.h, z5.h, z1.h[0]\n"
+ "fmla z11.h, z5.h, z1.h[1]\n"
+ "ld1h { z2.h }, p0/Z, [x20]\n"
+ "fmla z14.h, z5.h, z1.h[2]\n"
+ "fmla z17.h, z5.h, z1.h[3]\n"
+ "fmla z20.h, z5.h, z1.h[4]\n"
+ "fmla z23.h, z5.h, z1.h[5]\n"
+ "fmla z26.h, z5.h, z1.h[6]\n"
+ "fmla z29.h, z5.h, z1.h[7]\n"
+ "fmla z9.h, z6.h, z1.h[0]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z15.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z1.h[3]\n"
+ "fmla z21.h, z6.h, z1.h[4]\n"
+ "fmla z24.h, z6.h, z1.h[5]\n"
+ "fmla z27.h, z6.h, z1.h[6]\n"
+ "fmla z30.h, z6.h, z1.h[7]\n"
+ "fmla z10.h, z7.h, z1.h[0]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z16.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z1.h[3]\n"
+ "fmla z22.h, z7.h, z1.h[4]\n"
+ "fmla z25.h, z7.h, z1.h[5]\n"
+ "fmla z28.h, z7.h, z1.h[6]\n"
+ "fmla z31.h, z7.h, z1.h[7]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla z8.h, z2.h, z0.h[0]\n"
+ "fmla z11.h, z2.h, z0.h[1]\n"
+ "ld1h { z3.h }, p0/Z, [x20, #1, MUL VL]\n"
+ "fmla z14.h, z2.h, z0.h[2]\n"
+ "fmla z17.h, z2.h, z0.h[3]\n"
+ "ld1h { z4.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "fmla z20.h, z2.h, z0.h[4]\n"
+ "fmla z23.h, z2.h, z0.h[5]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "fmla z26.h, z2.h, z0.h[6]\n"
+ "fmla z29.h, z2.h, z0.h[7]\n"
+ "addvl x20, x20, #3\n"
+ "fmla z9.h, z3.h, z0.h[0]\n"
+ "fmla z12.h, z3.h, z0.h[1]\n"
+ "fmla z15.h, z3.h, z0.h[2]\n"
+ "fmla z18.h, z3.h, z0.h[3]\n"
+ "fmla z21.h, z3.h, z0.h[4]\n"
+ "fmla z24.h, z3.h, z0.h[5]\n"
+ "fmla z27.h, z3.h, z0.h[6]\n"
+ "fmla z30.h, z3.h, z0.h[7]\n"
+ "fmla z10.h, z4.h, z0.h[0]\n"
+ "fmla z13.h, z4.h, z0.h[1]\n"
+ "fmla z16.h, z4.h, z0.h[2]\n"
+ "fmla z19.h, z4.h, z0.h[3]\n"
+ "fmla z22.h, z4.h, z0.h[4]\n"
+ "fmla z25.h, z4.h, z0.h[5]\n"
+ "fmla z28.h, z4.h, z0.h[6]\n"
+ "fmla z31.h, z4.h, z0.h[7]\n"
+ "cbz x19, 5f\n"
+ "ld1rqh { z0.h }, p0/Z, [%x[Apanel]]\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ "ld1h { z5.h }, p0/Z, [x20]\n"
+ "ld1h { z6.h }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1h { z7.h }, p0/Z, [x20, #2, MUL VL]\n"
+ "addvl x20, x20, #3\n"
+ "fmla z8.h, z5.h, z0.h[0]\n"
+ "fmla z11.h, z5.h, z0.h[1]\n"
+ "fmla z14.h, z5.h, z0.h[2]\n"
+ "fmla z17.h, z5.h, z0.h[3]\n"
+ "fmla z20.h, z5.h, z0.h[4]\n"
+ "fmla z23.h, z5.h, z0.h[5]\n"
+ "fmla z26.h, z5.h, z0.h[6]\n"
+ "fmla z29.h, z5.h, z0.h[7]\n"
+ "fmla z9.h, z6.h, z0.h[0]\n"
+ "fmla z12.h, z6.h, z0.h[1]\n"
+ "fmla z15.h, z6.h, z0.h[2]\n"
+ "fmla z18.h, z6.h, z0.h[3]\n"
+ "fmla z21.h, z6.h, z0.h[4]\n"
+ "fmla z24.h, z6.h, z0.h[5]\n"
+ "fmla z27.h, z6.h, z0.h[6]\n"
+ "fmla z30.h, z6.h, z0.h[7]\n"
+ "fmla z10.h, z7.h, z0.h[0]\n"
+ "fmla z13.h, z7.h, z0.h[1]\n"
+ "fmla z16.h, z7.h, z0.h[2]\n"
+ "fmla z19.h, z7.h, z0.h[3]\n"
+ "fmla z22.h, z7.h, z0.h[4]\n"
+ "fmla z25.h, z7.h, z0.h[5]\n"
+ "fmla z28.h, z7.h, z0.h[6]\n"
+ "fmla z31.h, z7.h, z0.h[7]\n"
+ "5:" // multiply loop done
+ "st1h { z8.h }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1h { z9.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z10.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z11.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z12.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z13.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z14.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z15.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1h { z16.h }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1h { z17.h }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1h { z18.h }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1h { z19.h }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1h { z20.h }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1h { z21.h }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1h { z22.h }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1h { z23.h }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1h { z24.h }, p0, [%x[Cpanel]]\n"
+ "st1h { z25.h }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1h { z26.h }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1h { z27.h }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1h { z28.h }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1h { z29.h }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1h { z30.h }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1h { z31.h }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
#endif // ARM_COMPUTE_ENABLE_SVE
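
The rewrite above also changes how arguments reach the assembly: instead of binding a_ptr, b_ptr, loops, and tails each to its own register operand around a C++ loop nest, the regenerated kernels pack everything into a KernelArgs struct and let the asm fetch fields on demand through args_ptr plus offsetof, with the height and width loops folded into the assembly itself. A minimal sketch of that idiom, with the asm body cut down to a single field load (field names follow the kernel above; this is an illustration, not the generated code):

#include <cstddef>

void example_kernel(const float *Apanel, const float *Bpanel,
                    float *Cpanel, int ablocks, int bblocks, int K) {
    // All loop state lives in one struct; the asm reads what it needs.
    struct KernelArgs {
        size_t bblocks = {};
        size_t K = {};
        const float *Bpanel = {};
    } ka;
    ka.bblocks = bblocks;
    ka.K = K - 1;   // pre-decremented, matching the generated kernels
    ka.Bpanel = Bpanel;

    __asm__ __volatile__(
        // Fetch a field through the struct pointer rather than binding it
        // to a dedicated register operand.
        "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
        : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel)
        : [args_ptr] "r" (&ka),
          [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel))
        : "memory", "x20");
    (void) ablocks;
}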
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
index f4bb809fe8..9bf8253fc8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,63 +10,95 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const float *, const float *, \
+ float *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_fp32_mla_8x3VL(const float *, const float *, float *, int, int, int);
+void sve_interleaved_fp32_mla_8x3VL( ARGLIST );
+void sve_interleaved_fp32_mla_8x3VL_a64fx( ARGLIST );
-class cls_sve_interleaved_fp32_mla_8x3VL {
+class cls_sve_interleaved_fp32_mla_8x3VL
+{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return get_vector_length<float>() * 3;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return get_vector_length<float>();
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 1;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_fp32_mla_8x3VL;
+ if (std::is_same<T, float>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 7.2307, 3.876, 2.932 };
+ }
+ }
- cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *)
- {
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_fp32_mla_8x3VL;
+ cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model()) {
+ default:
+ break;
+ case CPUModel::A64FX:
+ kernel=sve_interleaved_fp32_mla_8x3VL_a64fx;
+ break;
+ }
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..6defe0e223
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/a64fx.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+
+namespace arm_gemm {
+
+void sve_interleaved_fp32_mla_8x3VL_a64fx(
+ const float *Apanel, const float *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const float *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1w { z1.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "ld1w { z2.s }, p0/Z, [x20, #2, MUL VL]\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla z8.s, p0/M, z0.s, z3.s\n"
+ "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "sub x19, x19, #0x2\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "fmla z12.s, p0/M, z1.s, z4.s\n"
+ "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z14.s, p0/M, z0.s, z5.s\n"
+ "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "cmp x19, #0x2\n"
+ "fmla z16.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z17.s, p0/M, z0.s, z6.s\n"
+ "fmla z18.s, p0/M, z1.s, z6.s\n"
+ "fmla z19.s, p0/M, z2.s, z6.s\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z3.s\n"
+ "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "fmla z22.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "fmla z24.s, p0/M, z1.s, z4.s\n"
+ "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
+ "fmla z26.s, p0/M, z0.s, z5.s\n"
+ "fmla z27.s, p0/M, z1.s, z5.s\n"
+ "fmla z28.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
+ "fmla z29.s, p0/M, z0.s, z6.s\n"
+ "ld1w { z0.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "fmla z30.s, p0/M, z1.s, z6.s\n"
+ "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "ld1w { z1.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x20, #5, MUL VL]\n"
+ "fmla z8.s, p0/M, z0.s, z3.s\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
+ "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z12.s, p0/M, z1.s, z4.s\n"
+ "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
+ "fmla z14.s, p0/M, z0.s, z5.s\n"
+ "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "addvl x20, x20, #6\n"
+ "fmla z16.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
+ "fmla z17.s, p0/M, z0.s, z6.s\n"
+ "fmla z18.s, p0/M, z1.s, z6.s\n"
+ "fmla z19.s, p0/M, z2.s, z6.s\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "fmla z20.s, p0/M, z0.s, z3.s\n"
+ "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "fmla z22.s, p0/M, z2.s, z3.s\n"
+ "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z24.s, p0/M, z1.s, z4.s\n"
+ "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "fmla z26.s, p0/M, z0.s, z5.s\n"
+ "fmla z27.s, p0/M, z1.s, z5.s\n"
+ "fmla z28.s, p0/M, z2.s, z5.s\n"
+ "fmla z29.s, p0/M, z0.s, z6.s\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ "fmla z30.s, p0/M, z1.s, z6.s\n"
+ "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "ld1w { z1.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla z8.s, p0/M, z0.s, z3.s\n"
+ "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "addvl x20, x20, #3\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "fmla z12.s, p0/M, z1.s, z4.s\n"
+ "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z14.s, p0/M, z0.s, z5.s\n"
+ "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "fmla z16.s, p0/M, z2.s, z5.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z17.s, p0/M, z0.s, z6.s\n"
+ "fmla z18.s, p0/M, z1.s, z6.s\n"
+ "fmla z19.s, p0/M, z2.s, z6.s\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "fmla z20.s, p0/M, z0.s, z3.s\n"
+ "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z22.s, p0/M, z2.s, z3.s\n"
+ "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "fmla z24.s, p0/M, z1.s, z4.s\n"
+ "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "fmla z26.s, p0/M, z0.s, z5.s\n"
+ "fmla z27.s, p0/M, z1.s, z5.s\n"
+ "fmla z28.s, p0/M, z2.s, z5.s\n"
+ "fmla z29.s, p0/M, z0.s, z6.s\n"
+ "fmla z30.s, p0/M, z1.s, z6.s\n"
+ "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "cbz x19, 5f\n"
+ "ld1w { z0.s }, p0/Z, [x20]\n"
+ "ld1w { z1.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z8.s, p0/M, z0.s, z3.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "fmla z9.s, p0/M, z1.s, z3.s\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "fmla z10.s, p0/M, z2.s, z3.s\n"
+ "fmla z11.s, p0/M, z0.s, z4.s\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z12.s, p0/M, z1.s, z4.s\n"
+ "fmla z13.s, p0/M, z2.s, z4.s\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "fmla z14.s, p0/M, z0.s, z5.s\n"
+ "fmla z15.s, p0/M, z1.s, z5.s\n"
+ "fmla z16.s, p0/M, z2.s, z5.s\n"
+ "fmla z17.s, p0/M, z0.s, z6.s\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "fmla z18.s, p0/M, z1.s, z6.s\n"
+ "fmla z19.s, p0/M, z2.s, z6.s\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "addvl x20, x20, #3\n"
+ "fmla z20.s, p0/M, z0.s, z3.s\n"
+ "fmla z21.s, p0/M, z1.s, z3.s\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z22.s, p0/M, z2.s, z3.s\n"
+ "fmla z23.s, p0/M, z0.s, z4.s\n"
+ "fmla z24.s, p0/M, z1.s, z4.s\n"
+ "fmla z25.s, p0/M, z2.s, z4.s\n"
+ "fmla z26.s, p0/M, z0.s, z5.s\n"
+ "fmla z27.s, p0/M, z1.s, z5.s\n"
+ "fmla z28.s, p0/M, z2.s, z5.s\n"
+ "fmla z29.s, p0/M, z0.s, z6.s\n"
+ "fmla z30.s, p0/M, z1.s, z6.s\n"
+ "fmla z31.s, p0/M, z2.s, z6.s\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
index 10feaa130b..e02db6ec48 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,319 +10,236 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include "../../asmlib.hpp"
+#include <cstddef>
namespace arm_gemm {
-void sve_interleaved_fp32_mla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+void sve_interleaved_fp32_mla_8x3VL(
+ const float *Apanel, const float *Bpanel,
+ float *Cpanel, int ablocks, int bblocks, int K) {
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const float *Bpanel = {};
+ } ka;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
+ ka.bblocks = bblocks;
+ ka.K = (K/1) - 1;
+ ka.Bpanel = Bpanel;
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.s\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z17.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "mov z18.s, #0\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla z8.s, z4.s, z0.s[0]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.s, z4.s, z0.s[1]\n"
- "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.s, z4.s, z0.s[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z11.s, z4.s, z0.s[3]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
- "fmla z21.s, z4.s, z1.s[1]\n"
- "fmla z22.s, z4.s, z1.s[2]\n"
- "fmla z23.s, z4.s, z1.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
- "fmla z12.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z14.s, z5.s, z0.s[2]\n"
- "fmla z15.s, z5.s, z0.s[3]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z26.s, z5.s, z1.s[2]\n"
- "fmla z27.s, z5.s, z1.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "fmla z16.s, z6.s, z0.s[0]\n"
- "fmla z17.s, z6.s, z0.s[1]\n"
- "fmla z18.s, z6.s, z0.s[2]\n"
- "fmla z19.s, z6.s, z0.s[3]\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
- "fmla z29.s, z6.s, z1.s[1]\n"
- "fmla z30.s, z6.s, z1.s[2]\n"
- "fmla z31.s, z6.s, z1.s[3]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "fmla z8.s, z4.s, z2.s[0]\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
- "fmla z9.s, z4.s, z2.s[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "fmla z10.s, z4.s, z2.s[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "fmla z11.s, z4.s, z2.s[3]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
- "fmla z21.s, z4.s, z3.s[1]\n"
- "fmla z22.s, z4.s, z3.s[2]\n"
- "fmla z23.s, z4.s, z3.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "fmla z12.s, z5.s, z2.s[0]\n"
- "fmla z13.s, z5.s, z2.s[1]\n"
- "fmla z14.s, z5.s, z2.s[2]\n"
- "fmla z15.s, z5.s, z2.s[3]\n"
- "fmla z24.s, z5.s, z3.s[0]\n"
- "fmla z25.s, z5.s, z3.s[1]\n"
- "fmla z26.s, z5.s, z3.s[2]\n"
- "fmla z27.s, z5.s, z3.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z17.s, z6.s, z2.s[1]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z19.s, z6.s, z2.s[3]\n"
- "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
- "fmla z28.s, z6.s, z3.s[0]\n"
- "fmla z29.s, z6.s, z3.s[1]\n"
- "fmla z30.s, z6.s, z3.s[2]\n"
- "fmla z31.s, z6.s, z3.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- "fmla z8.s, z4.s, z0.s[0]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.s, z4.s, z0.s[1]\n"
- "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.s, z4.s, z0.s[2]\n"
- "fmla z11.s, z4.s, z0.s[3]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
- "fmla z21.s, z4.s, z1.s[1]\n"
- "fmla z22.s, z4.s, z1.s[2]\n"
- "fmla z23.s, z4.s, z1.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr]]\n"
- "fmla z12.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z14.s, z5.s, z0.s[2]\n"
- "fmla z15.s, z5.s, z0.s[3]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z26.s, z5.s, z1.s[2]\n"
- "fmla z27.s, z5.s, z1.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "fmla z16.s, z6.s, z0.s[0]\n"
- "fmla z17.s, z6.s, z0.s[1]\n"
- "fmla z18.s, z6.s, z0.s[2]\n"
- "fmla z19.s, z6.s, z0.s[3]\n"
- "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
- "fmla z29.s, z6.s, z1.s[1]\n"
- "fmla z30.s, z6.s, z1.s[2]\n"
- "fmla z31.s, z6.s, z1.s[3]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "fmla z8.s, z4.s, z2.s[0]\n"
- "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
- "fmla z9.s, z4.s, z2.s[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "fmla z10.s, z4.s, z2.s[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "fmla z11.s, z4.s, z2.s[3]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
- "fmla z21.s, z4.s, z3.s[1]\n"
- "fmla z22.s, z4.s, z3.s[2]\n"
- "fmla z23.s, z4.s, z3.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "fmla z12.s, z5.s, z2.s[0]\n"
- "fmla z13.s, z5.s, z2.s[1]\n"
- "fmla z14.s, z5.s, z2.s[2]\n"
- "fmla z15.s, z5.s, z2.s[3]\n"
- "fmla z24.s, z5.s, z3.s[0]\n"
- "fmla z25.s, z5.s, z3.s[1]\n"
- "fmla z26.s, z5.s, z3.s[2]\n"
- "fmla z27.s, z5.s, z3.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z17.s, z6.s, z2.s[1]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z19.s, z6.s, z2.s[3]\n"
- "fmla z28.s, z6.s, z3.s[0]\n"
- "fmla z29.s, z6.s, z3.s[1]\n"
- "fmla z30.s, z6.s, z3.s[2]\n"
- "fmla z31.s, z6.s, z3.s[3]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z8.s, z4.s, z0.s[0]\n"
- "fmla z9.s, z4.s, z0.s[1]\n"
- "fmla z10.s, z4.s, z0.s[2]\n"
- "fmla z11.s, z4.s, z0.s[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
- "fmla z21.s, z4.s, z1.s[1]\n"
- "fmla z22.s, z4.s, z1.s[2]\n"
- "fmla z23.s, z4.s, z1.s[3]\n"
- "fmla z12.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z14.s, z5.s, z0.s[2]\n"
- "fmla z15.s, z5.s, z0.s[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z26.s, z5.s, z1.s[2]\n"
- "fmla z27.s, z5.s, z1.s[3]\n"
- "fmla z16.s, z6.s, z0.s[0]\n"
- "fmla z17.s, z6.s, z0.s[1]\n"
- "fmla z18.s, z6.s, z0.s[2]\n"
- "fmla z19.s, z6.s, z0.s[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
- "fmla z29.s, z6.s, z1.s[1]\n"
- "fmla z30.s, z6.s, z1.s[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "fmla z31.s, z6.s, z1.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla z8.s, z4.s, z0.s[0]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z9.s, z4.s, z0.s[1]\n"
- "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
- "fmla z10.s, z4.s, z0.s[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "fmla z11.s, z4.s, z0.s[3]\n"
- "fmla z20.s, z4.s, z1.s[0]\n"
- "fmla z21.s, z4.s, z1.s[1]\n"
- "fmla z22.s, z4.s, z1.s[2]\n"
- "fmla z23.s, z4.s, z1.s[3]\n"
- "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "fmla z12.s, z5.s, z0.s[0]\n"
- "fmla z13.s, z5.s, z0.s[1]\n"
- "fmla z14.s, z5.s, z0.s[2]\n"
- "fmla z15.s, z5.s, z0.s[3]\n"
- "fmla z24.s, z5.s, z1.s[0]\n"
- "fmla z25.s, z5.s, z1.s[1]\n"
- "fmla z26.s, z5.s, z1.s[2]\n"
- "fmla z27.s, z5.s, z1.s[3]\n"
- "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "fmla z16.s, z6.s, z0.s[0]\n"
- "fmla z17.s, z6.s, z0.s[1]\n"
- "fmla z18.s, z6.s, z0.s[2]\n"
- "fmla z19.s, z6.s, z0.s[3]\n"
- "fmla z28.s, z6.s, z1.s[0]\n"
- "fmla z29.s, z6.s, z1.s[1]\n"
- "fmla z30.s, z6.s, z1.s[2]\n"
- "fmla z31.s, z6.s, z1.s[3]\n"
- "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "fmla z8.s, z4.s, z2.s[0]\n"
- "fmla z9.s, z4.s, z2.s[1]\n"
- "fmla z10.s, z4.s, z2.s[2]\n"
- "fmla z11.s, z4.s, z2.s[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "fmla z20.s, z4.s, z3.s[0]\n"
- "fmla z21.s, z4.s, z3.s[1]\n"
- "fmla z22.s, z4.s, z3.s[2]\n"
- "fmla z23.s, z4.s, z3.s[3]\n"
- "fmla z12.s, z5.s, z2.s[0]\n"
- "fmla z13.s, z5.s, z2.s[1]\n"
- "fmla z14.s, z5.s, z2.s[2]\n"
- "fmla z15.s, z5.s, z2.s[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "fmla z24.s, z5.s, z3.s[0]\n"
- "fmla z25.s, z5.s, z3.s[1]\n"
- "fmla z26.s, z5.s, z3.s[2]\n"
- "fmla z27.s, z5.s, z3.s[3]\n"
- "fmla z16.s, z6.s, z2.s[0]\n"
- "fmla z17.s, z6.s, z2.s[1]\n"
- "fmla z18.s, z6.s, z2.s[2]\n"
- "fmla z19.s, z6.s, z2.s[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "fmla z28.s, z6.s, z3.s[0]\n"
- "fmla z29.s, z6.s, z3.s[1]\n"
- "fmla z30.s, z6.s, z3.s[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "fmla z31.s, z6.s, z3.s[3]\n"
- "4:\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1w z25.s, p0, [%[c_ptr]]\n"
- "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov %x[Apanel], x21\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "cmp x19, #0x2\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "fmla z11.s, z4.s, z0.s[1]\n"
+ "ld1w { z5.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "fmla z14.s, z4.s, z0.s[2]\n"
+ "fmla z17.s, z4.s, z0.s[3]\n"
+ "ld1w { z6.s }, p0/Z, [x20, #2, MUL VL]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z23.s, z4.s, z1.s[1]\n"
+ "ld1rqw { z2.s }, p0/Z, [%x[Apanel], #32]\n"
+ "fmla z26.s, z4.s, z1.s[2]\n"
+ "fmla z29.s, z4.s, z1.s[3]\n"
+ "ld1rqw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+ "fmla z9.s, z5.s, z0.s[0]\n"
+ "fmla z12.s, z5.s, z0.s[1]\n"
+ "ld1w { z4.s }, p0/Z, [x20, #3, MUL VL]\n"
+ "fmla z15.s, z5.s, z0.s[2]\n"
+ "fmla z18.s, z5.s, z0.s[3]\n"
+ "sub x19, x19, #0x2\n"
+ "fmla z21.s, z5.s, z1.s[0]\n"
+ "fmla z24.s, z5.s, z1.s[1]\n"
+ "cmp x19, #0x2\n"
+ "fmla z27.s, z5.s, z1.s[2]\n"
+ "fmla z30.s, z5.s, z1.s[3]\n"
+ "ld1w { z5.s }, p0/Z, [x20, #4, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z0.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "fmla z16.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
+ "fmla z22.s, z6.s, z1.s[0]\n"
+ "fmla z25.s, z6.s, z1.s[1]\n"
+ "fmla z28.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "ld1w { z6.s }, p0/Z, [x20, #5, MUL VL]\n"
+ "fmla z8.s, z4.s, z2.s[0]\n"
+ "fmla z11.s, z4.s, z2.s[1]\n"
+ "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "fmla z14.s, z4.s, z2.s[2]\n"
+ "fmla z17.s, z4.s, z2.s[3]\n"
+ "addvl x20, x20, #6\n"
+ "fmla z20.s, z4.s, z3.s[0]\n"
+ "fmla z23.s, z4.s, z3.s[1]\n"
+ "fmla z26.s, z4.s, z3.s[2]\n"
+ "fmla z29.s, z4.s, z3.s[3]\n"
+ "fmla z9.s, z5.s, z2.s[0]\n"
+ "fmla z12.s, z5.s, z2.s[1]\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
+ "fmla z15.s, z5.s, z2.s[2]\n"
+ "fmla z18.s, z5.s, z2.s[3]\n"
+ "fmla z21.s, z5.s, z3.s[0]\n"
+ "fmla z24.s, z5.s, z3.s[1]\n"
+ "fmla z27.s, z5.s, z3.s[2]\n"
+ "fmla z30.s, z5.s, z3.s[3]\n"
+ "fmla z10.s, z6.s, z2.s[0]\n"
+ "fmla z13.s, z6.s, z2.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z19.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z25.s, z6.s, z3.s[1]\n"
+ "fmla z28.s, z6.s, z3.s[2]\n"
+ "fmla z31.s, z6.s, z3.s[3]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "fmla z8.s, z4.s, z0.s[0]\n"
+ "fmla z11.s, z4.s, z0.s[1]\n"
+ "ld1w { z5.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "fmla z14.s, z4.s, z0.s[2]\n"
+ "fmla z17.s, z4.s, z0.s[3]\n"
+ "ld1w { z6.s }, p0/Z, [x20, #2, MUL VL]\n"
+ "fmla z20.s, z4.s, z1.s[0]\n"
+ "fmla z23.s, z4.s, z1.s[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "fmla z26.s, z4.s, z1.s[2]\n"
+ "fmla z29.s, z4.s, z1.s[3]\n"
+ "addvl x20, x20, #3\n"
+ "fmla z9.s, z5.s, z0.s[0]\n"
+ "fmla z12.s, z5.s, z0.s[1]\n"
+ "fmla z15.s, z5.s, z0.s[2]\n"
+ "fmla z18.s, z5.s, z0.s[3]\n"
+ "fmla z21.s, z5.s, z1.s[0]\n"
+ "fmla z24.s, z5.s, z1.s[1]\n"
+ "fmla z27.s, z5.s, z1.s[2]\n"
+ "fmla z30.s, z5.s, z1.s[3]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z13.s, z6.s, z0.s[1]\n"
+ "fmla z16.s, z6.s, z0.s[2]\n"
+ "fmla z19.s, z6.s, z0.s[3]\n"
+ "fmla z22.s, z6.s, z1.s[0]\n"
+ "fmla z25.s, z6.s, z1.s[1]\n"
+ "fmla z28.s, z6.s, z1.s[2]\n"
+ "fmla z31.s, z6.s, z1.s[3]\n"
+ "cbz x19, 5f\n"
+ "ld1rqw { z0.s }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqw { z1.s }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1w { z7.s }, p0/Z, [x20]\n"
+ "ld1w { z4.s }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1w { z5.s }, p0/Z, [x20, #2, MUL VL]\n"
+ "addvl x20, x20, #3\n"
+ "fmla z8.s, z7.s, z0.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z14.s, z7.s, z0.s[2]\n"
+ "fmla z17.s, z7.s, z0.s[3]\n"
+ "fmla z20.s, z7.s, z1.s[0]\n"
+ "fmla z23.s, z7.s, z1.s[1]\n"
+ "fmla z26.s, z7.s, z1.s[2]\n"
+ "fmla z29.s, z7.s, z1.s[3]\n"
+ "fmla z9.s, z4.s, z0.s[0]\n"
+ "fmla z12.s, z4.s, z0.s[1]\n"
+ "fmla z15.s, z4.s, z0.s[2]\n"
+ "fmla z18.s, z4.s, z0.s[3]\n"
+ "fmla z21.s, z4.s, z1.s[0]\n"
+ "fmla z24.s, z4.s, z1.s[1]\n"
+ "fmla z27.s, z4.s, z1.s[2]\n"
+ "fmla z30.s, z4.s, z1.s[3]\n"
+ "fmla z10.s, z5.s, z0.s[0]\n"
+ "fmla z13.s, z5.s, z0.s[1]\n"
+ "fmla z16.s, z5.s, z0.s[2]\n"
+ "fmla z19.s, z5.s, z0.s[3]\n"
+ "fmla z22.s, z5.s, z1.s[0]\n"
+ "fmla z25.s, z5.s, z1.s[1]\n"
+ "fmla z28.s, z5.s, z1.s[2]\n"
+ "fmla z31.s, z5.s, z1.s[3]\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
#endif // ARM_COMPUTE_ENABLE_SVE
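
For reference, the accumulator layout the rewritten fp32 kernel keeps in z8..z31 is an 8-row by 3-vector tile: z8/z9/z10 hold row 0 across the three B vectors, z11/z12/z13 hold row 1, and so on, which is why the stores after label 5 run straight through z8..z31. A scalar sketch of the tile's semantics; VL and the packing order are assumptions spelled out in the comments, not code from the patch:

// C tile += A panel * B panel for one 8 x (3*VL) block. VL is the SVE vector
// length in fp32 lanes; A is assumed packed 8 rows per K step and B packed 3
// consecutive vectors per K step, matching the kernel's loads.
void tile_8x3vl_ref(const float *A, const float *B, float *C, int K, int VL) {
    for (int k = 0; k < K; ++k) {
        for (int r = 0; r < 8; ++r) {
            float a = A[k * 8 + r];                  // ld1rqw broadcast + fmla z.s[r]
            for (int v = 0; v < 3; ++v) {
                for (int l = 0; l < VL; ++l) {       // one fmla per (r, v) pair
                    C[(r * 3 + v) * VL + l] += a * B[(k * 3 + v) * VL + l];
                }
            }
        }
    }
}
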
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
index a985a91b90..a50cd95157 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
@@ -394,4 +394,4 @@ void sve_interleaved_fp32_mmla_8x3VL(const float *Apanel, const float *Bpanel, f
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
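
Two similarly named macros appear in these guards and are easy to conflate: __ARM_FEATURE_SVE is defined by the compiler when a translation unit is itself built for an SVE target, while ARM_COMPUTE_ENABLE_SVE is a build-system switch controlling whether the SVE kernels are compiled at all. A sketch of the distinction (illustrative only):

// Build-system guard vs. compiler feature macro.
#ifdef ARM_COMPUTE_ENABLE_SVE   // set by the build when SVE kernels are wanted
void sve_kernel(const float *a); // body uses explicit SVE encodings via inline asm
#endif

#ifdef __ARM_FEATURE_SVE        // set by the compiler for SVE compile targets
// code here may rely on the compiler's own SVE code generation
#endif

Keeping the kernels behind the build-system macro lets them be compiled and dispatched at runtime even when the surrounding code is not built with SVE as the baseline.
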
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index aa6d9e7ec8..1924b2cf75 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,64 +10,103 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_s8s32_dot_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_dot_8x3VL( ARGLIST );
-class cls_sve_interleaved_s8s32_dot_8x3VL {
+class cls_sve_interleaved_s8s32_dot_8x3VL
+{
public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return get_vector_length<int32_t>() * 3;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return get_vector_length<int32_t>();
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 4;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.66, 4.10, 7.99 };
+ case CPUModel::V1:
+ return { 63.30, 4.97, 11.35 };
+ case CPUModel::A510:
+ return { 27.42, 3.47, 2.88 };
+ }
+ }
+
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.67, 3.57, 0.50 };
+ case CPUModel::V1:
+ return { 63.35, 4.76, 0.77 };
+ case CPUModel::A510:
+ return { 27.47, 1.70, 0.28 };
+ }
+ }
+
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
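
The get_performance_parameters() triples added above feed the improved kernel-selection heuristics mentioned in the commit message: each kernel advertises rough per-cycle throughput figures per CPU model, and the framework can pick the cheapest candidate for a given problem shape. A sketch of how such a triple might be consumed; the field names follow performance_parameters.hpp, but the cost model below is illustrative, not the library's actual formula:

struct PerformanceParameters {            // mirrors the fields the triples initialise
    float kernel_macs_cycle;              // e.g. { 31.66, 4.10, 7.99 } above
    float prepare_bytes_cycle;
    float merge_bytes_cycle;
};

// Rough cycle estimate for an MxNxK GEMM under one kernel's parameters.
float estimate_cycles(const PerformanceParameters &p,
                      float M, float N, float K, float bytes_per_elem) {
    float mac_cycles     = (M * N * K) / p.kernel_macs_cycle;                // inner-product work
    float prepare_cycles = (K * N * bytes_per_elem) / p.prepare_bytes_cycle; // B-panel packing
    float merge_cycles   = (M * N * bytes_per_elem) / p.merge_bytes_cycle;   // C write-out
    return mac_cycles + prepare_cycles + merge_cycles;
}

The templated-on-T switch above lets the same kernel report different merge costs for quantized (int8 output) versus int32 output paths.
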
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..5ca4b73b8a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/a64fx.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_interleaved_s8s32_dot_8x3VL_a64fx(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "sdot z8.s, z0.b, z3.b\n"
+ "sdot z9.s, z1.b, z3.b\n"
+ "sub x19, x19, #0x2\n"
+ "sdot z10.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "sdot z11.s, z0.b, z4.b\n"
+ "sdot z12.s, z1.b, z4.b\n"
+ "sdot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "sdot z14.s, z0.b, z5.b\n"
+ "sdot z15.s, z1.b, z5.b\n"
+ "cmp x19, #0x2\n"
+ "sdot z16.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "sdot z17.s, z0.b, z6.b\n"
+ "sdot z18.s, z1.b, z6.b\n"
+ "sdot z19.s, z2.b, z6.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z0.b, z3.b\n"
+ "sdot z21.s, z1.b, z3.b\n"
+ "sdot z22.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "sdot z23.s, z0.b, z4.b\n"
+ "sdot z24.s, z1.b, z4.b\n"
+ "sdot z25.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
+ "sdot z26.s, z0.b, z5.b\n"
+ "sdot z27.s, z1.b, z5.b\n"
+ "sdot z28.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
+ "sdot z29.s, z0.b, z6.b\n"
+ "ld1b { z0.b }, p0/Z, [x20, #3, MUL VL]\n"
+ "sdot z30.s, z1.b, z6.b\n"
+ "sdot z31.s, z2.b, z6.b\n"
+ "ld1b { z1.b }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x20, #5, MUL VL]\n"
+ "sdot z8.s, z0.b, z3.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
+ "sdot z9.s, z1.b, z3.b\n"
+ "sdot z10.s, z2.b, z3.b\n"
+ "sdot z11.s, z0.b, z4.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+ "sdot z12.s, z1.b, z4.b\n"
+ "sdot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
+ "sdot z14.s, z0.b, z5.b\n"
+ "sdot z15.s, z1.b, z5.b\n"
+ "addvl x20, x20, #6\n"
+ "sdot z16.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
+ "sdot z17.s, z0.b, z6.b\n"
+ "sdot z18.s, z1.b, z6.b\n"
+ "sdot z19.s, z2.b, z6.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "sdot z20.s, z0.b, z3.b\n"
+ "sdot z21.s, z1.b, z3.b\n"
+ "sdot z22.s, z2.b, z3.b\n"
+ "sdot z23.s, z0.b, z4.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "sdot z24.s, z1.b, z4.b\n"
+ "sdot z25.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "sdot z26.s, z0.b, z5.b\n"
+ "sdot z27.s, z1.b, z5.b\n"
+ "sdot z28.s, z2.b, z5.b\n"
+ "sdot z29.s, z0.b, z6.b\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "sdot z30.s, z1.b, z6.b\n"
+ "sdot z31.s, z2.b, z6.b\n"
+ "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "sdot z8.s, z0.b, z3.b\n"
+ "sdot z9.s, z1.b, z3.b\n"
+ "addvl x20, x20, #3\n"
+ "sdot z10.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "sdot z11.s, z0.b, z4.b\n"
+ "sdot z12.s, z1.b, z4.b\n"
+ "sdot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "sdot z14.s, z0.b, z5.b\n"
+ "sdot z15.s, z1.b, z5.b\n"
+ "sdot z16.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "sdot z17.s, z0.b, z6.b\n"
+ "sdot z18.s, z1.b, z6.b\n"
+ "sdot z19.s, z2.b, z6.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "sdot z20.s, z0.b, z3.b\n"
+ "sdot z21.s, z1.b, z3.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "sdot z22.s, z2.b, z3.b\n"
+ "sdot z23.s, z0.b, z4.b\n"
+ "sdot z24.s, z1.b, z4.b\n"
+ "sdot z25.s, z2.b, z4.b\n"
+ "sdot z26.s, z0.b, z5.b\n"
+ "sdot z27.s, z1.b, z5.b\n"
+ "sdot z28.s, z2.b, z5.b\n"
+ "sdot z29.s, z0.b, z6.b\n"
+ "sdot z30.s, z1.b, z6.b\n"
+ "sdot z31.s, z2.b, z6.b\n"
+ "cbz x19, 5f\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "sdot z8.s, z0.b, z3.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "sdot z9.s, z1.b, z3.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "sdot z10.s, z2.b, z3.b\n"
+ "sdot z11.s, z0.b, z4.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "sdot z12.s, z1.b, z4.b\n"
+ "sdot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "sdot z14.s, z0.b, z5.b\n"
+ "sdot z15.s, z1.b, z5.b\n"
+ "sdot z16.s, z2.b, z5.b\n"
+ "sdot z17.s, z0.b, z6.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "sdot z18.s, z1.b, z6.b\n"
+ "sdot z19.s, z2.b, z6.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "addvl x20, x20, #3\n"
+ "sdot z20.s, z0.b, z3.b\n"
+ "sdot z21.s, z1.b, z3.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "sdot z22.s, z2.b, z3.b\n"
+ "sdot z23.s, z0.b, z4.b\n"
+ "sdot z24.s, z1.b, z4.b\n"
+ "sdot z25.s, z2.b, z4.b\n"
+ "sdot z26.s, z0.b, z5.b\n"
+ "sdot z27.s, z1.b, z5.b\n"
+ "sdot z28.s, z2.b, z5.b\n"
+ "sdot z29.s, z0.b, z6.b\n"
+ "sdot z30.s, z1.b, z6.b\n"
+ "sdot z31.s, z2.b, z6.b\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
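
The A64FX variant above differs from the generic kernel in how it feeds A: it broadcasts one 32-bit group of four int8 values per register with ld1rw and uses the plain (non-indexed) sdot form, instead of ld1rqb plus indexed sdot. The per-lane arithmetic is the same either way; a reference sketch, with VL assumed to be the vector length in 32-bit lanes:

#include <cstdint>

// acc[l] += dot(b_vec lane l, a_quad) for each 32-bit lane: the effect of
// "sdot z8.s, z0.b, z3.b" above when z3 holds the same four A bytes in every lane.
void sdot_ref(int32_t *acc, const int8_t *b_vec, const int8_t a_quad[4], int VL) {
    for (int l = 0; l < VL; ++l) {
        int32_t sum = 0;
        for (int j = 0; j < 4; ++j) {
            sum += int32_t(b_vec[l * 4 + j]) * int32_t(a_quad[j]);  // 4-way int8 dot
        }
        acc[l] += sum;
    }
}
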
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
index 01c0f8cddc..5fb938b20f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,320 +10,237 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_s8s32_dot_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
+void sve_interleaved_s8s32_dot_8x3VL(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
- K /= 4;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.b\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z17.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "mov z18.s, #0\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "sdot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "sdot z10.s, z4.b, z0.b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z11.s, z4.b, z0.b[3]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z22.s, z4.b, z1.b[2]\n"
- "sdot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "sdot z12.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z14.s, z5.b, z0.b[2]\n"
- "sdot z15.s, z5.b, z0.b[3]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z1.b[2]\n"
- "sdot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "sdot z17.s, z6.b, z0.b[1]\n"
- "sdot z18.s, z6.b, z0.b[2]\n"
- "sdot z19.s, z6.b, z0.b[3]\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
- "sdot z29.s, z6.b, z1.b[1]\n"
- "sdot z30.s, z6.b, z1.b[2]\n"
- "sdot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "sdot z8.s, z4.b, z2.b[0]\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "sdot z9.s, z4.b, z2.b[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "sdot z10.s, z4.b, z2.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "sdot z11.s, z4.b, z2.b[3]\n"
- "sdot z20.s, z4.b, z3.b[0]\n"
- "sdot z21.s, z4.b, z3.b[1]\n"
- "sdot z22.s, z4.b, z3.b[2]\n"
- "sdot z23.s, z4.b, z3.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "sdot z12.s, z5.b, z2.b[0]\n"
- "sdot z13.s, z5.b, z2.b[1]\n"
- "sdot z14.s, z5.b, z2.b[2]\n"
- "sdot z15.s, z5.b, z2.b[3]\n"
- "sdot z24.s, z5.b, z3.b[0]\n"
- "sdot z25.s, z5.b, z3.b[1]\n"
- "sdot z26.s, z5.b, z3.b[2]\n"
- "sdot z27.s, z5.b, z3.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z17.s, z6.b, z2.b[1]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z19.s, z6.b, z2.b[3]\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- "sdot z28.s, z6.b, z3.b[0]\n"
- "sdot z29.s, z6.b, z3.b[1]\n"
- "sdot z30.s, z6.b, z3.b[2]\n"
- "sdot z31.s, z6.b, z3.b[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- "sdot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "sdot z10.s, z4.b, z0.b[2]\n"
- "sdot z11.s, z4.b, z0.b[3]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z22.s, z4.b, z1.b[2]\n"
- "sdot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "sdot z12.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z14.s, z5.b, z0.b[2]\n"
- "sdot z15.s, z5.b, z0.b[3]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z1.b[2]\n"
- "sdot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "sdot z17.s, z6.b, z0.b[1]\n"
- "sdot z18.s, z6.b, z0.b[2]\n"
- "sdot z19.s, z6.b, z0.b[3]\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
- "sdot z29.s, z6.b, z1.b[1]\n"
- "sdot z30.s, z6.b, z1.b[2]\n"
- "sdot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "sdot z8.s, z4.b, z2.b[0]\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "sdot z9.s, z4.b, z2.b[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "sdot z10.s, z4.b, z2.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "sdot z11.s, z4.b, z2.b[3]\n"
- "sdot z20.s, z4.b, z3.b[0]\n"
- "sdot z21.s, z4.b, z3.b[1]\n"
- "sdot z22.s, z4.b, z3.b[2]\n"
- "sdot z23.s, z4.b, z3.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "sdot z12.s, z5.b, z2.b[0]\n"
- "sdot z13.s, z5.b, z2.b[1]\n"
- "sdot z14.s, z5.b, z2.b[2]\n"
- "sdot z15.s, z5.b, z2.b[3]\n"
- "sdot z24.s, z5.b, z3.b[0]\n"
- "sdot z25.s, z5.b, z3.b[1]\n"
- "sdot z26.s, z5.b, z3.b[2]\n"
- "sdot z27.s, z5.b, z3.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z17.s, z6.b, z2.b[1]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z19.s, z6.b, z2.b[3]\n"
- "sdot z28.s, z6.b, z3.b[0]\n"
- "sdot z29.s, z6.b, z3.b[1]\n"
- "sdot z30.s, z6.b, z3.b[2]\n"
- "sdot z31.s, z6.b, z3.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z8.s, z4.b, z0.b[0]\n"
- "sdot z9.s, z4.b, z0.b[1]\n"
- "sdot z10.s, z4.b, z0.b[2]\n"
- "sdot z11.s, z4.b, z0.b[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z22.s, z4.b, z1.b[2]\n"
- "sdot z23.s, z4.b, z1.b[3]\n"
- "sdot z12.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z14.s, z5.b, z0.b[2]\n"
- "sdot z15.s, z5.b, z0.b[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z1.b[2]\n"
- "sdot z27.s, z5.b, z1.b[3]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "sdot z17.s, z6.b, z0.b[1]\n"
- "sdot z18.s, z6.b, z0.b[2]\n"
- "sdot z19.s, z6.b, z0.b[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
- "sdot z29.s, z6.b, z1.b[1]\n"
- "sdot z30.s, z6.b, z1.b[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "sdot z31.s, z6.b, z1.b[3]\n"
- "b 4f\n"
- "3:\n"
- "sdot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "sdot z10.s, z4.b, z0.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "sdot z11.s, z4.b, z0.b[3]\n"
- "sdot z20.s, z4.b, z1.b[0]\n"
- "sdot z21.s, z4.b, z1.b[1]\n"
- "sdot z22.s, z4.b, z1.b[2]\n"
- "sdot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "sdot z12.s, z5.b, z0.b[0]\n"
- "sdot z13.s, z5.b, z0.b[1]\n"
- "sdot z14.s, z5.b, z0.b[2]\n"
- "sdot z15.s, z5.b, z0.b[3]\n"
- "sdot z24.s, z5.b, z1.b[0]\n"
- "sdot z25.s, z5.b, z1.b[1]\n"
- "sdot z26.s, z5.b, z1.b[2]\n"
- "sdot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "sdot z16.s, z6.b, z0.b[0]\n"
- "sdot z17.s, z6.b, z0.b[1]\n"
- "sdot z18.s, z6.b, z0.b[2]\n"
- "sdot z19.s, z6.b, z0.b[3]\n"
- "sdot z28.s, z6.b, z1.b[0]\n"
- "sdot z29.s, z6.b, z1.b[1]\n"
- "sdot z30.s, z6.b, z1.b[2]\n"
- "sdot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "sdot z8.s, z4.b, z2.b[0]\n"
- "sdot z9.s, z4.b, z2.b[1]\n"
- "sdot z10.s, z4.b, z2.b[2]\n"
- "sdot z11.s, z4.b, z2.b[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "sdot z20.s, z4.b, z3.b[0]\n"
- "sdot z21.s, z4.b, z3.b[1]\n"
- "sdot z22.s, z4.b, z3.b[2]\n"
- "sdot z23.s, z4.b, z3.b[3]\n"
- "sdot z12.s, z5.b, z2.b[0]\n"
- "sdot z13.s, z5.b, z2.b[1]\n"
- "sdot z14.s, z5.b, z2.b[2]\n"
- "sdot z15.s, z5.b, z2.b[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "sdot z24.s, z5.b, z3.b[0]\n"
- "sdot z25.s, z5.b, z3.b[1]\n"
- "sdot z26.s, z5.b, z3.b[2]\n"
- "sdot z27.s, z5.b, z3.b[3]\n"
- "sdot z16.s, z6.b, z2.b[0]\n"
- "sdot z17.s, z6.b, z2.b[1]\n"
- "sdot z18.s, z6.b, z2.b[2]\n"
- "sdot z19.s, z6.b, z2.b[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "sdot z28.s, z6.b, z3.b[0]\n"
- "sdot z29.s, z6.b, z3.b[1]\n"
- "sdot z30.s, z6.b, z3.b[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "sdot z31.s, z6.b, z3.b[3]\n"
- "4:\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1w z25.s, p0, [%[c_ptr]]\n"
- "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x20]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov %x[Apanel], x21\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "cmp x19, #0x2\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "sdot z8.s, z4.b, z0.b[0]\n"
+ "sdot z11.s, z4.b, z0.b[1]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "sdot z14.s, z4.b, z0.b[2]\n"
+ "sdot z17.s, z4.b, z0.b[3]\n"
+ "ld1b { z6.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "sdot z26.s, z4.b, z1.b[2]\n"
+ "sdot z29.s, z4.b, z1.b[3]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
+ "sdot z9.s, z5.b, z0.b[0]\n"
+ "sdot z12.s, z5.b, z0.b[1]\n"
+ "ld1b { z4.b }, p0/Z, [x20, #3, MUL VL]\n"
+ "sdot z15.s, z5.b, z0.b[2]\n"
+ "sdot z18.s, z5.b, z0.b[3]\n"
+ "sub x19, x19, #0x2\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "sdot z24.s, z5.b, z1.b[1]\n"
+ "cmp x19, #0x2\n"
+ "sdot z27.s, z5.b, z1.b[2]\n"
+ "sdot z30.s, z5.b, z1.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #4, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z0.b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "sdot z16.s, z6.b, z0.b[2]\n"
+ "sdot z19.s, z6.b, z0.b[3]\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "sdot z25.s, z6.b, z1.b[1]\n"
+ "sdot z28.s, z6.b, z1.b[2]\n"
+ "sdot z31.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p0/Z, [x20, #5, MUL VL]\n"
+ "sdot z8.s, z4.b, z2.b[0]\n"
+ "sdot z11.s, z4.b, z2.b[1]\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "sdot z14.s, z4.b, z2.b[2]\n"
+ "sdot z17.s, z4.b, z2.b[3]\n"
+ "addvl x20, x20, #6\n"
+ "sdot z20.s, z4.b, z3.b[0]\n"
+ "sdot z23.s, z4.b, z3.b[1]\n"
+ "sdot z26.s, z4.b, z3.b[2]\n"
+ "sdot z29.s, z4.b, z3.b[3]\n"
+ "sdot z9.s, z5.b, z2.b[0]\n"
+ "sdot z12.s, z5.b, z2.b[1]\n"
+ "ld1b { z4.b }, p0/Z, [x20]\n"
+ "sdot z15.s, z5.b, z2.b[2]\n"
+ "sdot z18.s, z5.b, z2.b[3]\n"
+ "sdot z21.s, z5.b, z3.b[0]\n"
+ "sdot z24.s, z5.b, z3.b[1]\n"
+ "sdot z27.s, z5.b, z3.b[2]\n"
+ "sdot z30.s, z5.b, z3.b[3]\n"
+ "sdot z10.s, z6.b, z2.b[0]\n"
+ "sdot z13.s, z6.b, z2.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z19.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z25.s, z6.b, z3.b[1]\n"
+ "sdot z28.s, z6.b, z3.b[2]\n"
+ "sdot z31.s, z6.b, z3.b[3]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "sdot z8.s, z4.b, z0.b[0]\n"
+ "sdot z11.s, z4.b, z0.b[1]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "sdot z14.s, z4.b, z0.b[2]\n"
+ "sdot z17.s, z4.b, z0.b[3]\n"
+ "ld1b { z6.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "sdot z26.s, z4.b, z1.b[2]\n"
+ "sdot z29.s, z4.b, z1.b[3]\n"
+ "addvl x20, x20, #3\n"
+ "sdot z9.s, z5.b, z0.b[0]\n"
+ "sdot z12.s, z5.b, z0.b[1]\n"
+ "sdot z15.s, z5.b, z0.b[2]\n"
+ "sdot z18.s, z5.b, z0.b[3]\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "sdot z24.s, z5.b, z1.b[1]\n"
+ "sdot z27.s, z5.b, z1.b[2]\n"
+ "sdot z30.s, z5.b, z1.b[3]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z13.s, z6.b, z0.b[1]\n"
+ "sdot z16.s, z6.b, z0.b[2]\n"
+ "sdot z19.s, z6.b, z0.b[3]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "sdot z25.s, z6.b, z1.b[1]\n"
+ "sdot z28.s, z6.b, z1.b[2]\n"
+ "sdot z31.s, z6.b, z1.b[3]\n"
+ "cbz x19, 5f\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1b { z7.b }, p0/Z, [x20]\n"
+ "ld1b { z4.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "addvl x20, x20, #3\n"
+ "sdot z8.s, z7.b, z0.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z14.s, z7.b, z0.b[2]\n"
+ "sdot z17.s, z7.b, z0.b[3]\n"
+ "sdot z20.s, z7.b, z1.b[0]\n"
+ "sdot z23.s, z7.b, z1.b[1]\n"
+ "sdot z26.s, z7.b, z1.b[2]\n"
+ "sdot z29.s, z7.b, z1.b[3]\n"
+ "sdot z9.s, z4.b, z0.b[0]\n"
+ "sdot z12.s, z4.b, z0.b[1]\n"
+ "sdot z15.s, z4.b, z0.b[2]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z21.s, z4.b, z1.b[0]\n"
+ "sdot z24.s, z4.b, z1.b[1]\n"
+ "sdot z27.s, z4.b, z1.b[2]\n"
+ "sdot z30.s, z4.b, z1.b[3]\n"
+ "sdot z10.s, z5.b, z0.b[0]\n"
+ "sdot z13.s, z5.b, z0.b[1]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z22.s, z5.b, z1.b[0]\n"
+ "sdot z25.s, z5.b, z1.b[1]\n"
+ "sdot z28.s, z5.b, z1.b[2]\n"
+ "sdot z31.s, z5.b, z1.b[3]\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
#endif // ARM_COMPUTE_ENABLE_SVE
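
The counter handling in the rewritten kernel above is easy to misread: ka.K is (K/4) - 1 because sdot retires four K elements per step and one step is peeled out of the loop. The main loop at label 3 retires two steps per pass while the counter is at least 2, label 4 executes one step unconditionally, and the cbz-guarded block mops up the odd step, so the total is exactly K/4. The control flow, sketched in C++:

// Control flow implied by "ka.K = (K/4) - 1" and the cmp/sub/cbz sequence.
void k_loop_shape(long steps /* = K/4 */) {
    long x19 = steps - 1;            // ka.K, reloaded per tile via ldr x19
    while (x19 >= 2) {               // "3:" main loop head, two steps per pass
        /* unrolled step; unrolled step */
        x19 -= 2;
    }
    /* one step, always executed */  // "4:" main loop skip
    if (x19 != 0) {                  // "cbz x19, 5f"
        /* final odd step */
    }
    // "5:" stores the 24 accumulators
}
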
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index 671946b262..bd1764bb7f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,64 +10,103 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const int8_t *, const int8_t *, \
+ int32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_mmla_8x3VL( ARGLIST );
-class cls_sve_interleaved_s8s32_mmla_8x3VL {
+class cls_sve_interleaved_s8s32_mmla_8x3VL
+{
public:
typedef int8_t operand_type;
typedef int32_t result_type;
- typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return get_vector_length<int32_t>() * 3;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return get_vector_length<int32_t>();
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 8;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL;
+ if (std::is_same<T, int32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 61.98, 3.90, 7.94 };
+ case CPUModel::V1:
+ return { 123.42, 5.00, 11.52 };
+ case CPUModel::A510:
+ return { 43.14, 3.62, 2.90 };
+ }
+ }
+
+
+ if (std::is_same<T, int8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 61.97, 3.64, 0.50 };
+ case CPUModel::V1:
+ return { 123.84, 4.93, 0.76 };
+ case CPUModel::A510:
+ return { 43.36, 1.86, 0.28 };
+ }
+ }
+
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL;
cls_sve_interleaved_s8s32_mmla_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
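
For the mmla variant declared above, k_unroll() is 8 because each smmla consumes eight K elements at once: per 128-bit segment, the instruction multiplies a 2x8 int8 block against an 8x2 int8 block and accumulates a 2x2 int32 result. A reference sketch of one segment; the packing comments state the assumed interpretation:

#include <cstdint>

// One 128-bit segment of "smmla zda.s, zn.b, zm.b": a holds a 2x8 row-major
// block, b holds a 2x8 row-major block (i.e. an 8x2 block of B stored
// transposed), acc holds a 2x2 row-major int32 block.
void smmla_ref_segment(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 2; ++j) {
            int32_t sum = 0;
            for (int k = 0; k < 8; ++k) {
                sum += int32_t(a[i * 8 + k]) * int32_t(b[j * 8 + k]);
            }
            acc[i * 2 + j] += sum;
        }
    }
}

This 2x2-per-segment output shape is why the mmla class uses StdTransformsSVE<..., 8, 6, 8, 2> with a block parameter of 2, where the dot-product class uses <..., 8, 3, 4, 1>.
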
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
index 9420210aae..b8f1864af3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,375 +23,271 @@
*/
#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
+void sve_interleaved_s8s32_mmla_8x3VL(
+ const int8_t *Apanel, const int8_t *Bpanel,
+ int32_t *Cpanel, int ablocks, int bblocks, int K) {
- K /= 8;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const int8_t *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.b\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z17.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
- ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
- ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
- ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "addvl %[b_ptr], %[b_ptr], #12\n"
- ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
- ".inst 0x4507987b // smmla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
- ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
- ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
- ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069832 // smmla z18.s, z1.b, z6.b\n"
- ".inst 0x45069858 // smmla z24.s, z2.b, z6.b\n"
- ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
- ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
- ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- "addvl %[b_ptr], %[b_ptr], #14\n"
- ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
- ".inst 0x4507987b // smmla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
- ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
- ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
- ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069832 // smmla z18.s, z1.b, z6.b\n"
- ".inst 0x45069858 // smmla z24.s, z2.b, z6.b\n"
- ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
- ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
- ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
- ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
- ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
- ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
- ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
- ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
- ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
- ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
- ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
- ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x4504980c // smmla z12.s, z0.b, z4.b\n"
- ".inst 0x45049832 // smmla z18.s, z1.b, z4.b\n"
- ".inst 0x45049858 // smmla z24.s, z2.b, z4.b\n"
- ".inst 0x4504987e // smmla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
- ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
- ".inst 0x45079809 // smmla z9.s, z0.b, z7.b\n"
- ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x4507982f // smmla z15.s, z1.b, z7.b\n"
- ".inst 0x45079855 // smmla z21.s, z2.b, z7.b\n"
- ".inst 0x4507987b // smmla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
- ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
- ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
- ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
- ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
- ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
- ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
- ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x4506980c // smmla z12.s, z0.b, z6.b\n"
- ".inst 0x45069832 // smmla z18.s, z1.b, z6.b\n"
- ".inst 0x45069858 // smmla z24.s, z2.b, z6.b\n"
- ".inst 0x4506987e // smmla z30.s, z3.b, z6.b\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- "4:\n"
- "uzp2 z2.d, z16.d, z17.d\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "uzp2 z3.d, z18.d, z19.d\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "uzp1 z4.d, z20.d, z21.d\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "uzp1 z5.d, z22.d, z23.d\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "uzp1 z6.d, z24.d, z25.d\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "uzp2 z7.d, z20.d, z21.d\n"
- "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "uzp2 z0.d, z22.d, z23.d\n"
- "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "uzp2 z1.d, z24.d, z25.d\n"
- "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "uzp1 z2.d, z26.d, z27.d\n"
- "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "uzp1 z3.d, z28.d, z29.d\n"
- "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "uzp1 z4.d, z30.d, z31.d\n"
- "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "uzp2 z5.d, z26.d, z27.d\n"
- "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "uzp2 z6.d, z28.d, z29.d\n"
- "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "uzp2 z7.d, z30.d, z31.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x20]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "addvl x20, x20, #2\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
+ ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
+ ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
+ ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x20]\n"
+ ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
+ ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
+ ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
+ "ld1b { z4.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x45069809 // smmla z9.s, z0.b, z6.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506987b // smmla z27.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, #4, MUL VL]\n"
+ ".inst 0x4507980c // smmla z12.s, z0.b, z7.b\n"
+ ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x45079858 // smmla z24.s, z2.b, z7.b\n"
+ ".inst 0x4507987e // smmla z30.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p0/Z, [x20, #5, MUL VL]\n"
+ ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
+ ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
+ ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1b { z4.b }, p0/Z, [x20, #6, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
+ ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
+ ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [x20, #-7, MUL VL]\n"
+ ".inst 0x45049809 // smmla z9.s, z0.b, z4.b\n"
+ ".inst 0x4504982f // smmla z15.s, z1.b, z4.b\n"
+ ".inst 0x45049855 // smmla z21.s, z2.b, z4.b\n"
+ ".inst 0x4504987b // smmla z27.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p0/Z, [x20, #-6, MUL VL]\n"
+ ".inst 0x4505980c // smmla z12.s, z0.b, z5.b\n"
+ ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
+ ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
+ ".inst 0x45059832 // smmla z18.s, z1.b, z5.b\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
+ ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ ".inst 0x4505987e // smmla z30.s, z3.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x20, #-5, MUL VL]\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
+ ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
+ ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "addvl x20, x20, #-4\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
+ ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
+ ".inst 0x4505980b // smmla z11.s, z0.b, z5.b\n"
+ ".inst 0x45059831 // smmla z17.s, z1.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x20]\n"
+ ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
+ ".inst 0x45059857 // smmla z23.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
+ ".inst 0x4505987d // smmla z29.s, z3.b, z5.b\n"
+ "ld1b { z4.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x45069809 // smmla z9.s, z0.b, z6.b\n"
+ ".inst 0x4506982f // smmla z15.s, z1.b, z6.b\n"
+ ".inst 0x45069855 // smmla z21.s, z2.b, z6.b\n"
+ ".inst 0x4506987b // smmla z27.s, z3.b, z6.b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x4507980c // smmla z12.s, z0.b, z7.b\n"
+ ".inst 0x4504980a // smmla z10.s, z0.b, z4.b\n"
+ "addvl x20, x20, #4\n"
+ ".inst 0x4505980d // smmla z13.s, z0.b, z5.b\n"
+ ".inst 0x45079832 // smmla z18.s, z1.b, z7.b\n"
+ ".inst 0x45049830 // smmla z16.s, z1.b, z4.b\n"
+ ".inst 0x45059833 // smmla z19.s, z1.b, z5.b\n"
+ ".inst 0x45079858 // smmla z24.s, z2.b, z7.b\n"
+ ".inst 0x4507987e // smmla z30.s, z3.b, z7.b\n"
+ ".inst 0x45049856 // smmla z22.s, z2.b, z4.b\n"
+ ".inst 0x45059859 // smmla z25.s, z2.b, z5.b\n"
+ ".inst 0x4504987c // smmla z28.s, z3.b, z4.b\n"
+ ".inst 0x4505987f // smmla z31.s, z3.b, z5.b\n"
+ "cbz x19, 5f\n"
+ "ld1b { z6.b }, p0/Z, [x20]\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45069808 // smmla z8.s, z0.b, z6.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1b { z7.b }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x4506982e // smmla z14.s, z1.b, z6.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x4507980b // smmla z11.s, z0.b, z7.b\n"
+ ".inst 0x45079831 // smmla z17.s, z1.b, z7.b\n"
+ ".inst 0x45069854 // smmla z20.s, z2.b, z6.b\n"
+ "ld1b { z4.b }, p0/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x45079857 // smmla z23.s, z2.b, z7.b\n"
+ ".inst 0x4506987a // smmla z26.s, z3.b, z6.b\n"
+ "ld1b { z5.b }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x4507987d // smmla z29.s, z3.b, z7.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [x20, #5, MUL VL]\n"
+ ".inst 0x45049809 // smmla z9.s, z0.b, z4.b\n"
+ ".inst 0x4504982f // smmla z15.s, z1.b, z4.b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x45049855 // smmla z21.s, z2.b, z4.b\n"
+ ".inst 0x4504987b // smmla z27.s, z3.b, z4.b\n"
+ "addvl x20, x20, #6\n"
+ ".inst 0x4505980c // smmla z12.s, z0.b, z5.b\n"
+ ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
+ ".inst 0x4507980d // smmla z13.s, z0.b, z7.b\n"
+ ".inst 0x45059832 // smmla z18.s, z1.b, z5.b\n"
+ ".inst 0x45069830 // smmla z16.s, z1.b, z6.b\n"
+ ".inst 0x45079833 // smmla z19.s, z1.b, z7.b\n"
+ ".inst 0x45059858 // smmla z24.s, z2.b, z5.b\n"
+ ".inst 0x4505987e // smmla z30.s, z3.b, z5.b\n"
+ ".inst 0x45069856 // smmla z22.s, z2.b, z6.b\n"
+ ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
+ ".inst 0x4506987c // smmla z28.s, z3.b, z6.b\n"
+ ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
+ "5:" // multiply loop done
+ "uzp1 z4.d, z8.d, z11.d\n"
+ "uzp2 z8.d, z8.d, z11.d\n"
+ "st1w { z4.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z11.d, z9.d, z12.d\n"
+ "uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z12.d, z10.d, z13.d\n"
+ "uzp2 z10.d, z10.d, z13.d\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z13.d, z14.d, z17.d\n"
+ "uzp2 z14.d, z14.d, z17.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z17.d, z15.d, z18.d\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "uzp2 z15.d, z15.d, z18.d\n"
+ "uzp1 z18.d, z16.d, z19.d\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z19.d\n"
+ "uzp1 z19.d, z20.d, z23.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "uzp2 z20.d, z20.d, z23.d\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
+ "uzp2 z21.d, z21.d, z24.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z24.d, z22.d, z25.d\n"
+ "uzp2 z22.d, z22.d, z25.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z25.d, z26.d, z29.d\n"
+ "uzp2 z26.d, z26.d, z29.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z29.d, z27.d, z30.d\n"
+ "uzp2 z27.d, z27.d, z30.d\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z30.d, z28.d, z31.d\n"
+ "uzp2 z28.d, z28.d, z31.d\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
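
The rewrite above keeps the SMMLA strategy of the old kernel: within each 128-bit vector segment, smmla multiplies a 2x8 block of int8 A values by an 8x2 block of int8 B values and accumulates a 2x2 block of int32 results, which is why adjacent output rows end up interleaved across accumulator pairs and must be separated with uzp1/uzp2 before the st1w stores. A scalar model of one smmla segment (an illustration of the instruction's semantics, not library code):

    #include <cstdint>

    // One 128-bit segment of SMMLA: C += A(2x8) * B(8x2), widening to int32.
    static void smmla_segment(int32_t C[2][2], const int8_t A[2][8], const int8_t B[8][2]) {
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                for (int k = 0; k < 8; k++) {
                    C[i][j] += int32_t(A[i][k]) * int32_t(B[k][j]);
                }
            }
        }
    }
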
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index 7d39485164..f66a9bf51e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,64 +10,103 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_dot_8x3VL( ARGLIST );
-class cls_sve_interleaved_u8u32_dot_8x3VL {
+class cls_sve_interleaved_u8u32_dot_8x3VL
+{
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
- typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return get_vector_length<uint32_t>() * 3;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return get_vector_length<uint32_t>();
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 4;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.66, 4.11, 7.94 };
+ case CPUModel::A510:
+ return { 27.44, 3.41, 2.90 };
+ case CPUModel::V1:
+ return { 63.30, 4.97, 11.52 };
+ }
+ }
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 31.67, 4.04, 0.50 };
+ case CPUModel::A510:
+ return { 27.45, 1.65, 0.28 };
+ case CPUModel::V1:
+ return { 63.35, 4.96, 0.77 };
+ }
+ }
+
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
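
In the header above, get_performance_parameters() is templated on the result type so one kernel class can report separate cost triples for the plain uint32_t path and the quantized uint8_t path; since std::is_same<...>::value is a compile-time constant, each instantiation collapses to a single switch. With C++17 the same intent can be written explicitly, reusing PerformanceParameters, CPUInfo and CPUModel from the surrounding headers (a sketch, not the library's code):

    #include <cstdint>
    #include <type_traits>

    template <typename T>
    static PerformanceParameters get_perf_params_sketch(const CPUInfo *ci) {
        if constexpr (std::is_same_v<T, uint32_t>) {
            switch (ci->get_cpu_model()) {       // plain 32-bit results
                default:             return { 31.66, 4.11, 7.94 };
                case CPUModel::A510: return { 27.44, 3.41, 2.90 };
                case CPUModel::V1:   return { 63.30, 4.97, 11.52 };
            }
        } else if constexpr (std::is_same_v<T, uint8_t>) {
            switch (ci->get_cpu_model()) {       // quantized results
                default:             return { 31.67, 4.04, 0.50 };
                case CPUModel::A510: return { 27.45, 1.65, 0.28 };
                case CPUModel::V1:   return { 63.35, 4.96, 0.77 };
            }
        } else {
            return { 1.0 };
        }
    }
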
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
new file mode 100644
index 0000000000..1e2fb138fd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/a64fx.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef ARM_COMPUTE_ENABLE_SVE
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_interleaved_u8u32_dot_8x3VL_a64fx(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
+
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
+
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "udot z8.s, z0.b, z3.b\n"
+ "udot z9.s, z1.b, z3.b\n"
+ "sub x19, x19, #0x2\n"
+ "udot z10.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "udot z11.s, z0.b, z4.b\n"
+ "udot z12.s, z1.b, z4.b\n"
+ "udot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "udot z14.s, z0.b, z5.b\n"
+ "udot z15.s, z1.b, z5.b\n"
+ "cmp x19, #0x2\n"
+ "udot z16.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "udot z17.s, z0.b, z6.b\n"
+ "udot z18.s, z1.b, z6.b\n"
+ "udot z19.s, z2.b, z6.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z0.b, z3.b\n"
+ "udot z21.s, z1.b, z3.b\n"
+ "udot z22.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #32]\n"
+ "udot z23.s, z0.b, z4.b\n"
+ "udot z24.s, z1.b, z4.b\n"
+ "udot z25.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #36]\n"
+ "udot z26.s, z0.b, z5.b\n"
+ "udot z27.s, z1.b, z5.b\n"
+ "udot z28.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #40]\n"
+ "udot z29.s, z0.b, z6.b\n"
+ "ld1b { z0.b }, p0/Z, [x20, #3, MUL VL]\n"
+ "udot z30.s, z1.b, z6.b\n"
+ "udot z31.s, z2.b, z6.b\n"
+ "ld1b { z1.b }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x20, #5, MUL VL]\n"
+ "udot z8.s, z0.b, z3.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #44]\n"
+ "udot z9.s, z1.b, z3.b\n"
+ "udot z10.s, z2.b, z3.b\n"
+ "udot z11.s, z0.b, z4.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #48]\n"
+ "udot z12.s, z1.b, z4.b\n"
+ "udot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #52]\n"
+ "udot z14.s, z0.b, z5.b\n"
+ "udot z15.s, z1.b, z5.b\n"
+ "addvl x20, x20, #6\n"
+ "udot z16.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #56]\n"
+ "udot z17.s, z0.b, z6.b\n"
+ "udot z18.s, z1.b, z6.b\n"
+ "udot z19.s, z2.b, z6.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #60]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "udot z20.s, z0.b, z3.b\n"
+ "udot z21.s, z1.b, z3.b\n"
+ "udot z22.s, z2.b, z3.b\n"
+ "udot z23.s, z0.b, z4.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "udot z24.s, z1.b, z4.b\n"
+ "udot z25.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "udot z26.s, z0.b, z5.b\n"
+ "udot z27.s, z1.b, z5.b\n"
+ "udot z28.s, z2.b, z5.b\n"
+ "udot z29.s, z0.b, z6.b\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "udot z30.s, z1.b, z6.b\n"
+ "udot z31.s, z2.b, z6.b\n"
+ "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "udot z8.s, z0.b, z3.b\n"
+ "udot z9.s, z1.b, z3.b\n"
+ "addvl x20, x20, #3\n"
+ "udot z10.s, z2.b, z3.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "udot z11.s, z0.b, z4.b\n"
+ "udot z12.s, z1.b, z4.b\n"
+ "udot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "udot z14.s, z0.b, z5.b\n"
+ "udot z15.s, z1.b, z5.b\n"
+ "udot z16.s, z2.b, z5.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "udot z17.s, z0.b, z6.b\n"
+ "udot z18.s, z1.b, z6.b\n"
+ "udot z19.s, z2.b, z6.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "udot z20.s, z0.b, z3.b\n"
+ "udot z21.s, z1.b, z3.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "udot z22.s, z2.b, z3.b\n"
+ "udot z23.s, z0.b, z4.b\n"
+ "udot z24.s, z1.b, z4.b\n"
+ "udot z25.s, z2.b, z4.b\n"
+ "udot z26.s, z0.b, z5.b\n"
+ "udot z27.s, z1.b, z5.b\n"
+ "udot z28.s, z2.b, z5.b\n"
+ "udot z29.s, z0.b, z6.b\n"
+ "udot z30.s, z1.b, z6.b\n"
+ "udot z31.s, z2.b, z6.b\n"
+ "cbz x19, 5f\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "ld1b { z1.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1b { z2.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel]]\n"
+ "udot z8.s, z0.b, z3.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #4]\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #8]\n"
+ "udot z9.s, z1.b, z3.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #12]\n"
+ "udot z10.s, z2.b, z3.b\n"
+ "udot z11.s, z0.b, z4.b\n"
+ "ld1rw { z3.s }, p0/Z, [%x[Apanel], #16]\n"
+ "udot z12.s, z1.b, z4.b\n"
+ "udot z13.s, z2.b, z4.b\n"
+ "ld1rw { z4.s }, p0/Z, [%x[Apanel], #20]\n"
+ "udot z14.s, z0.b, z5.b\n"
+ "udot z15.s, z1.b, z5.b\n"
+ "udot z16.s, z2.b, z5.b\n"
+ "udot z17.s, z0.b, z6.b\n"
+ "ld1rw { z5.s }, p0/Z, [%x[Apanel], #24]\n"
+ "udot z18.s, z1.b, z6.b\n"
+ "udot z19.s, z2.b, z6.b\n"
+ "ld1rw { z6.s }, p0/Z, [%x[Apanel], #28]\n"
+ "addvl x20, x20, #3\n"
+ "udot z20.s, z0.b, z3.b\n"
+ "udot z21.s, z1.b, z3.b\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "udot z22.s, z2.b, z3.b\n"
+ "udot z23.s, z0.b, z4.b\n"
+ "udot z24.s, z1.b, z4.b\n"
+ "udot z25.s, z2.b, z4.b\n"
+ "udot z26.s, z0.b, z5.b\n"
+ "udot z27.s, z1.b, z5.b\n"
+ "udot z28.s, z2.b, z5.b\n"
+ "udot z29.s, z0.b, z6.b\n"
+ "udot z30.s, z1.b, z6.b\n"
+ "udot z31.s, z2.b, z6.b\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // ARM_COMPUTE_ENABLE_SVE
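
The new a64fx.cpp variant uses the same dot-product math as the generic kernel but a different feeding pattern: rather than loading a quadword of A with ld1rq and selecting 32-bit lane groups via the indexed form of UDOT (udot z8.s, z4.b, z0.b[0]), it broadcasts one four-byte group of A per ld1rw and applies the plain vector UDOT, a shape that maps better onto A64FX's long vectors. In ACLE SVE intrinsics the two shapes look roughly as follows (a sketch assuming <arm_sve.h>; these are not the generated kernels):

    #include <arm_sve.h>

    // Generic kernel shape: indexed UDOT against lane group 0 of a
    // quadword-replicated A vector (udot z.s, zb.b, za.b[0]).
    svuint32_t dot_indexed(svuint32_t acc, svuint8_t b, svuint8_t a_quad) {
        return svdot_lane_u32(acc, b, a_quad, 0);
    }

    // A64FX shape: A bytes already broadcast by ld1rw, so the plain
    // vector-by-vector UDOT suffices (udot z.s, zb.b, za.b).
    svuint32_t dot_broadcast(svuint32_t acc, svuint8_t b, svuint8_t a_bcast) {
        return svdot_u32(acc, b, a_bcast);
    }
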
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
index 2139bab69d..f1642d0b21 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,320 +10,237 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
+void sve_interleaved_u8u32_dot_8x3VL(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- K /= 4;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/4) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.b\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z17.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "mov z18.s, #0\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "udot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "udot z10.s, z4.b, z0.b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z11.s, z4.b, z0.b[3]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z22.s, z4.b, z1.b[2]\n"
- "udot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "udot z12.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z14.s, z5.b, z0.b[2]\n"
- "udot z15.s, z5.b, z0.b[3]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z1.b[2]\n"
- "udot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "udot z17.s, z6.b, z0.b[1]\n"
- "udot z18.s, z6.b, z0.b[2]\n"
- "udot z19.s, z6.b, z0.b[3]\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
- "udot z29.s, z6.b, z1.b[1]\n"
- "udot z30.s, z6.b, z1.b[2]\n"
- "udot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "udot z8.s, z4.b, z2.b[0]\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "udot z9.s, z4.b, z2.b[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "udot z10.s, z4.b, z2.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "udot z11.s, z4.b, z2.b[3]\n"
- "udot z20.s, z4.b, z3.b[0]\n"
- "udot z21.s, z4.b, z3.b[1]\n"
- "udot z22.s, z4.b, z3.b[2]\n"
- "udot z23.s, z4.b, z3.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "udot z12.s, z5.b, z2.b[0]\n"
- "udot z13.s, z5.b, z2.b[1]\n"
- "udot z14.s, z5.b, z2.b[2]\n"
- "udot z15.s, z5.b, z2.b[3]\n"
- "udot z24.s, z5.b, z3.b[0]\n"
- "udot z25.s, z5.b, z3.b[1]\n"
- "udot z26.s, z5.b, z3.b[2]\n"
- "udot z27.s, z5.b, z3.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z17.s, z6.b, z2.b[1]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z19.s, z6.b, z2.b[3]\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- "udot z28.s, z6.b, z3.b[0]\n"
- "udot z29.s, z6.b, z3.b[1]\n"
- "udot z30.s, z6.b, z3.b[2]\n"
- "udot z31.s, z6.b, z3.b[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- "udot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "udot z10.s, z4.b, z0.b[2]\n"
- "udot z11.s, z4.b, z0.b[3]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z22.s, z4.b, z1.b[2]\n"
- "udot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "udot z12.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z14.s, z5.b, z0.b[2]\n"
- "udot z15.s, z5.b, z0.b[3]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z1.b[2]\n"
- "udot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "udot z17.s, z6.b, z0.b[1]\n"
- "udot z18.s, z6.b, z0.b[2]\n"
- "udot z19.s, z6.b, z0.b[3]\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
- "udot z29.s, z6.b, z1.b[1]\n"
- "udot z30.s, z6.b, z1.b[2]\n"
- "udot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "udot z8.s, z4.b, z2.b[0]\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "udot z9.s, z4.b, z2.b[1]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "udot z10.s, z4.b, z2.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #6\n"
- "udot z11.s, z4.b, z2.b[3]\n"
- "udot z20.s, z4.b, z3.b[0]\n"
- "udot z21.s, z4.b, z3.b[1]\n"
- "udot z22.s, z4.b, z3.b[2]\n"
- "udot z23.s, z4.b, z3.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "udot z12.s, z5.b, z2.b[0]\n"
- "udot z13.s, z5.b, z2.b[1]\n"
- "udot z14.s, z5.b, z2.b[2]\n"
- "udot z15.s, z5.b, z2.b[3]\n"
- "udot z24.s, z5.b, z3.b[0]\n"
- "udot z25.s, z5.b, z3.b[1]\n"
- "udot z26.s, z5.b, z3.b[2]\n"
- "udot z27.s, z5.b, z3.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z17.s, z6.b, z2.b[1]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z19.s, z6.b, z2.b[3]\n"
- "udot z28.s, z6.b, z3.b[0]\n"
- "udot z29.s, z6.b, z3.b[1]\n"
- "udot z30.s, z6.b, z3.b[2]\n"
- "udot z31.s, z6.b, z3.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z8.s, z4.b, z0.b[0]\n"
- "udot z9.s, z4.b, z0.b[1]\n"
- "udot z10.s, z4.b, z0.b[2]\n"
- "udot z11.s, z4.b, z0.b[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z22.s, z4.b, z1.b[2]\n"
- "udot z23.s, z4.b, z1.b[3]\n"
- "udot z12.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z14.s, z5.b, z0.b[2]\n"
- "udot z15.s, z5.b, z0.b[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z1.b[2]\n"
- "udot z27.s, z5.b, z1.b[3]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "udot z17.s, z6.b, z0.b[1]\n"
- "udot z18.s, z6.b, z0.b[2]\n"
- "udot z19.s, z6.b, z0.b[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
- "udot z29.s, z6.b, z1.b[1]\n"
- "udot z30.s, z6.b, z1.b[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "udot z31.s, z6.b, z1.b[3]\n"
- "b 4f\n"
- "3:\n"
- "udot z8.s, z4.b, z0.b[0]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z9.s, z4.b, z0.b[1]\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- "udot z10.s, z4.b, z0.b[2]\n"
- "addvl %[b_ptr], %[b_ptr], #3\n"
- "udot z11.s, z4.b, z0.b[3]\n"
- "udot z20.s, z4.b, z1.b[0]\n"
- "udot z21.s, z4.b, z1.b[1]\n"
- "udot z22.s, z4.b, z1.b[2]\n"
- "udot z23.s, z4.b, z1.b[3]\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- "udot z12.s, z5.b, z0.b[0]\n"
- "udot z13.s, z5.b, z0.b[1]\n"
- "udot z14.s, z5.b, z0.b[2]\n"
- "udot z15.s, z5.b, z0.b[3]\n"
- "udot z24.s, z5.b, z1.b[0]\n"
- "udot z25.s, z5.b, z1.b[1]\n"
- "udot z26.s, z5.b, z1.b[2]\n"
- "udot z27.s, z5.b, z1.b[3]\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- "udot z16.s, z6.b, z0.b[0]\n"
- "udot z17.s, z6.b, z0.b[1]\n"
- "udot z18.s, z6.b, z0.b[2]\n"
- "udot z19.s, z6.b, z0.b[3]\n"
- "udot z28.s, z6.b, z1.b[0]\n"
- "udot z29.s, z6.b, z1.b[1]\n"
- "udot z30.s, z6.b, z1.b[2]\n"
- "udot z31.s, z6.b, z1.b[3]\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- "udot z8.s, z4.b, z2.b[0]\n"
- "udot z9.s, z4.b, z2.b[1]\n"
- "udot z10.s, z4.b, z2.b[2]\n"
- "udot z11.s, z4.b, z2.b[3]\n"
- "st1w z8.s, p0, [%[c_ptr]]\n"
- "udot z20.s, z4.b, z3.b[0]\n"
- "udot z21.s, z4.b, z3.b[1]\n"
- "udot z22.s, z4.b, z3.b[2]\n"
- "udot z23.s, z4.b, z3.b[3]\n"
- "udot z12.s, z5.b, z2.b[0]\n"
- "udot z13.s, z5.b, z2.b[1]\n"
- "udot z14.s, z5.b, z2.b[2]\n"
- "udot z15.s, z5.b, z2.b[3]\n"
- "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "udot z24.s, z5.b, z3.b[0]\n"
- "udot z25.s, z5.b, z3.b[1]\n"
- "udot z26.s, z5.b, z3.b[2]\n"
- "udot z27.s, z5.b, z3.b[3]\n"
- "udot z16.s, z6.b, z2.b[0]\n"
- "udot z17.s, z6.b, z2.b[1]\n"
- "udot z18.s, z6.b, z2.b[2]\n"
- "udot z19.s, z6.b, z2.b[3]\n"
- "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "udot z28.s, z6.b, z3.b[0]\n"
- "udot z29.s, z6.b, z3.b[1]\n"
- "udot z30.s, z6.b, z3.b[2]\n"
- "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "udot z31.s, z6.b, z3.b[3]\n"
- "4:\n"
- "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "st1w z25.s, p0, [%[c_ptr]]\n"
- "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x20]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov %x[Apanel], x21\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "cmp x19, #0x2\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "udot z8.s, z4.b, z0.b[0]\n"
+ "udot z11.s, z4.b, z0.b[1]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "udot z14.s, z4.b, z0.b[2]\n"
+ "udot z17.s, z4.b, z0.b[3]\n"
+ "ld1b { z6.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "udot z26.s, z4.b, z1.b[2]\n"
+ "udot z29.s, z4.b, z1.b[3]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
+ "udot z9.s, z5.b, z0.b[0]\n"
+ "udot z12.s, z5.b, z0.b[1]\n"
+ "ld1b { z4.b }, p0/Z, [x20, #3, MUL VL]\n"
+ "udot z15.s, z5.b, z0.b[2]\n"
+ "udot z18.s, z5.b, z0.b[3]\n"
+ "sub x19, x19, #0x2\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "udot z24.s, z5.b, z1.b[1]\n"
+ "cmp x19, #0x2\n"
+ "udot z27.s, z5.b, z1.b[2]\n"
+ "udot z30.s, z5.b, z1.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #4, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z13.s, z6.b, z0.b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ "udot z16.s, z6.b, z0.b[2]\n"
+ "udot z19.s, z6.b, z0.b[3]\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "udot z25.s, z6.b, z1.b[1]\n"
+ "udot z28.s, z6.b, z1.b[2]\n"
+ "udot z31.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p0/Z, [x20, #5, MUL VL]\n"
+ "udot z8.s, z4.b, z2.b[0]\n"
+ "udot z11.s, z4.b, z2.b[1]\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "udot z14.s, z4.b, z2.b[2]\n"
+ "udot z17.s, z4.b, z2.b[3]\n"
+ "addvl x20, x20, #6\n"
+ "udot z20.s, z4.b, z3.b[0]\n"
+ "udot z23.s, z4.b, z3.b[1]\n"
+ "udot z26.s, z4.b, z3.b[2]\n"
+ "udot z29.s, z4.b, z3.b[3]\n"
+ "udot z9.s, z5.b, z2.b[0]\n"
+ "udot z12.s, z5.b, z2.b[1]\n"
+ "ld1b { z4.b }, p0/Z, [x20]\n"
+ "udot z15.s, z5.b, z2.b[2]\n"
+ "udot z18.s, z5.b, z2.b[3]\n"
+ "udot z21.s, z5.b, z3.b[0]\n"
+ "udot z24.s, z5.b, z3.b[1]\n"
+ "udot z27.s, z5.b, z3.b[2]\n"
+ "udot z30.s, z5.b, z3.b[3]\n"
+ "udot z10.s, z6.b, z2.b[0]\n"
+ "udot z13.s, z6.b, z2.b[1]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z19.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z25.s, z6.b, z3.b[1]\n"
+ "udot z28.s, z6.b, z3.b[2]\n"
+ "udot z31.s, z6.b, z3.b[3]\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "udot z8.s, z4.b, z0.b[0]\n"
+ "udot z11.s, z4.b, z0.b[1]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "udot z14.s, z4.b, z0.b[2]\n"
+ "udot z17.s, z4.b, z0.b[3]\n"
+ "ld1b { z6.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "udot z26.s, z4.b, z1.b[2]\n"
+ "udot z29.s, z4.b, z1.b[3]\n"
+ "addvl x20, x20, #3\n"
+ "udot z9.s, z5.b, z0.b[0]\n"
+ "udot z12.s, z5.b, z0.b[1]\n"
+ "udot z15.s, z5.b, z0.b[2]\n"
+ "udot z18.s, z5.b, z0.b[3]\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "udot z24.s, z5.b, z1.b[1]\n"
+ "udot z27.s, z5.b, z1.b[2]\n"
+ "udot z30.s, z5.b, z1.b[3]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z13.s, z6.b, z0.b[1]\n"
+ "udot z16.s, z6.b, z0.b[2]\n"
+ "udot z19.s, z6.b, z0.b[3]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "udot z25.s, z6.b, z1.b[1]\n"
+ "udot z28.s, z6.b, z1.b[2]\n"
+ "udot z31.s, z6.b, z1.b[3]\n"
+ "cbz x19, 5f\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "add %x[Apanel], %x[Apanel], #0x20\n"
+ "ld1b { z7.b }, p0/Z, [x20]\n"
+ "ld1b { z4.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "addvl x20, x20, #3\n"
+ "udot z8.s, z7.b, z0.b[0]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z14.s, z7.b, z0.b[2]\n"
+ "udot z17.s, z7.b, z0.b[3]\n"
+ "udot z20.s, z7.b, z1.b[0]\n"
+ "udot z23.s, z7.b, z1.b[1]\n"
+ "udot z26.s, z7.b, z1.b[2]\n"
+ "udot z29.s, z7.b, z1.b[3]\n"
+ "udot z9.s, z4.b, z0.b[0]\n"
+ "udot z12.s, z4.b, z0.b[1]\n"
+ "udot z15.s, z4.b, z0.b[2]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z21.s, z4.b, z1.b[0]\n"
+ "udot z24.s, z4.b, z1.b[1]\n"
+ "udot z27.s, z4.b, z1.b[2]\n"
+ "udot z30.s, z4.b, z1.b[3]\n"
+ "udot z10.s, z5.b, z0.b[0]\n"
+ "udot z13.s, z5.b, z0.b[1]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z22.s, z5.b, z1.b[0]\n"
+ "udot z25.s, z5.b, z1.b[1]\n"
+ "udot z28.s, z5.b, z1.b[2]\n"
+ "udot z31.s, z5.b, z1.b[3]\n"
+ "5:" // multiply loop done
+ "st1w { z8.s }, p0, [%x[Cpanel]]\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z31.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
#endif // ARM_COMPUTE_ENABLE_SVE
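
The K bookkeeping in this wrapper is shared by all of the rewritten kernels: K arrives in elements, is divided by k_unroll (4 for the dot kernels) into multiply groups, and ka.K is set to one less than that count. The main loop then retires two groups per iteration while x19 >= 2, the block at label 4 always retires one, and the cbz-guarded tail retires the final group when the remainder is odd, so exactly K/4 groups are processed. For example, K = 24 gives ka.K = 5: two main-loop iterations (4 groups), one group at label 4, one in the tail. The control flow, modelled in C++ (the consume_* helpers are hypothetical stand-ins for the unrolled multiply blocks):

    // Hypothetical stand-ins for the unrolled multiply blocks.
    static void consume_one_group()  { /* one block of dot instructions  */ }
    static void consume_two_groups() { /* two blocks of dot instructions */ }

    static void k_loop_model(int K) {
        long x19 = (K / 4) - 1;   // ka.K as set up by the C++ wrapper
        while (x19 >= 2) {        // "3:" main loop head
            consume_two_groups();
            x19 -= 2;
        }
        consume_one_group();      // "4:" main loop skip, always taken
        if (x19 != 0) {           // cbz x19, 5f
            consume_one_group();  // odd leftover group
        }
        // Exactly K/4 groups consumed in total.
    }
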
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index ca9cadd6d7..b530202bd7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,64 +10,103 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
#ifdef ARM_COMPUTE_ENABLE_SVE
-
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#include "../performance_parameters.hpp"
-namespace arm_gemm {
+#define ARGLIST \
+ const uint8_t *, const uint8_t *, \
+ uint32_t *, int, int, int
+namespace arm_gemm
+{
// Actual kernel implementations
-void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_mmla_8x3VL( ARGLIST );
-class cls_sve_interleaved_u8u32_mmla_8x3VL {
+class cls_sve_interleaved_u8u32_mmla_8x3VL
+{
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
- typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
static unsigned int out_width()
{
return get_vector_length<uint32_t>() * 3;
}
- static unsigned int out_height()
+ static unsigned int stripe_width()
{
- return 8;
+ return get_vector_length<uint32_t>();
}
- static unsigned int k_unroll()
+ static constexpr unsigned int k_unroll()
{
return 8;
}
- // Use the standard fixed size transforms.
+
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
+ template<typename T>
+ static inline PerformanceParameters get_performance_parameters(const CPUInfo *ci)
+ {
- kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL;
+ if (std::is_same<T, uint32_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 61.97, 4.11, 7.93 };
+ case CPUModel::A510:
+ return { 43.18, 3.57, 2.89 };
+ case CPUModel::V1:
+ return { 123.47, 5.03, 11.76 };
+ }
+ }
+
+ if (std::is_same<T, uint8_t>::value) {
+ switch (ci->get_cpu_model()) {
+ default:
+ return { 62.00, 4.08, 0.51 };
+ case CPUModel::A510:
+ return { 38.02, 1.85, 0.28 };
+ case CPUModel::V1:
+ return { 123.84, 4.98, 0.76 };
+ }
+ }
+
+ return { 1.0 };
+ }
+ // Default to the generic kernel
+ kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL;
cls_sve_interleaved_u8u32_mmla_8x3VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
+
#endif // ARM_COMPUTE_ENABLE_SVE
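
The header rewrite above adds per-CPU PerformanceParameters alongside the standard transforms, keyed on the accumulating type so the plain uint32_t path and the requantizing uint8_t path report separate figures. The triples read like stage throughputs (inner-kernel MACs per cycle plus prepare and merge bytes per cycle); below is a hypothetical cost model built on that reading, with invented names and an invented formula rather than the library's actual selector:

    #include <cstdint>

    // Assumed meaning of the { kernel, prepare, merge } triples returned by
    // get_performance_parameters(); every name here is illustrative only.
    struct PerfParams {
        float macs_per_cycle;          // inner-kernel MAC throughput
        float prepare_bytes_per_cycle; // operand packing throughput
        float merge_bytes_per_cycle;   // result merge/store throughput
    };

    // Estimate total cycles for an MxNxK GEMM; a heuristic selector could
    // evaluate this per candidate kernel and keep the cheapest one.
    float estimate_cycles(const PerfParams &p,
                          uint64_t M, uint64_t N, uint64_t K)
    {
        float kernel  = float(M * N * K) / p.macs_per_cycle;
        float prepare = float((M + N) * K) / p.prepare_bytes_per_cycle;
        float merge   = float(M * N) * 4.0f / p.merge_bytes_per_cycle; // u32 out
        return kernel + prepare + merge;
    }
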
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
index d42385789c..c4fdfa6abc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,375 +23,271 @@
*/
#ifdef ARM_COMPUTE_ENABLE_SVE
+#include <cstddef>
#include <cstdint>
-#include "../../asmlib.hpp"
namespace arm_gemm {
-void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
+void sve_interleaved_u8u32_mmla_8x3VL(
+ const uint8_t *Apanel, const uint8_t *Bpanel,
+ uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- K /= 8;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
+ struct KernelArgs {
+ size_t bblocks = {};
+ size_t K = {};
+ const uint8_t *Bpanel = {};
+ } ka;
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
+ ka.bblocks = bblocks;
+ ka.K = (K/8) - 1;
+ ka.Bpanel = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "mov z8.s, #0\n"
- "ptrue p0.b\n"
- "mov z9.s, #0\n"
- "mov z10.s, #0\n"
- "mov z11.s, #0\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- "mov z12.s, #0\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- "mov z13.s, #0\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- "mov z14.s, #0\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- "mov z15.s, #0\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- "mov z16.s, #0\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- "mov z17.s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- "mov z18.s, #0\n"
- "addvl %[b_ptr], %[b_ptr], #4\n"
- "mov z19.s, #0\n"
- "mov z20.s, #0\n"
- "mov z21.s, #0\n"
- "mov z22.s, #0\n"
- "mov z23.s, #0\n"
- "mov z24.s, #0\n"
- "mov z25.s, #0\n"
- "mov z26.s, #0\n"
- "mov z27.s, #0\n"
- "mov z28.s, #0\n"
- "mov z29.s, #0\n"
- "mov z30.s, #0\n"
- "mov z31.s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
- ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
- ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
- ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "addvl %[b_ptr], %[b_ptr], #12\n"
- ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
- ".inst 0x45c7987b // ummla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
- ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
- ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
- ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69832 // ummla z18.s, z1.b, z6.b\n"
- ".inst 0x45c69858 // ummla z24.s, z2.b, z6.b\n"
- ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr]]\n"
- ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
- ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
- ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
- ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #4, MUL VL]\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #5, MUL VL]\n"
- ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- "add %[a_ptr], %[a_ptr], #0x80\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- "addvl %[b_ptr], %[b_ptr], #14\n"
- ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
- ".inst 0x45c7987b // ummla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
- ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
- ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
- ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69832 // ummla z18.s, z1.b, z6.b\n"
- ".inst 0x45c69858 // ummla z24.s, z2.b, z6.b\n"
- ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
- ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
- ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
- "addvl %[b_ptr], %[b_ptr], #8\n"
- ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
- ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
- ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
- ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
- ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
- ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
- ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
- ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-6, MUL VL]\n"
- ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
- ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
- ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
- ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-5, MUL VL]\n"
- ".inst 0x45c4980c // ummla z12.s, z0.b, z4.b\n"
- ".inst 0x45c49832 // ummla z18.s, z1.b, z4.b\n"
- ".inst 0x45c49858 // ummla z24.s, z2.b, z4.b\n"
- ".inst 0x45c4987e // ummla z30.s, z3.b, z4.b\n"
- "ld1b z4.b, p0/z, [%[b_ptr], #-4, MUL VL]\n"
- ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
- "ld1rqb z0.b, p0/z, [%[a_ptr], #-0x40]\n"
- ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
- "ld1rqb z1.b, p0/z, [%[a_ptr], #-0x30]\n"
- ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
- "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
- ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
- "ld1b z5.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
- ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
- "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
- ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
- ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
- ".inst 0x45c79809 // ummla z9.s, z0.b, z7.b\n"
- ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
- "ld1b z6.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
- ".inst 0x45c7982f // ummla z15.s, z1.b, z7.b\n"
- ".inst 0x45c79855 // ummla z21.s, z2.b, z7.b\n"
- ".inst 0x45c7987b // ummla z27.s, z3.b, z7.b\n"
- "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
- ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
- ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
- ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
- ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
- ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
- ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
- ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
- ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
- "uzp2 z4.d, z10.d, z11.d\n"
- ".inst 0x45c6980c // ummla z12.s, z0.b, z6.b\n"
- ".inst 0x45c69832 // ummla z18.s, z1.b, z6.b\n"
- ".inst 0x45c69858 // ummla z24.s, z2.b, z6.b\n"
- ".inst 0x45c6987e // ummla z30.s, z3.b, z6.b\n"
- "uzp1 z6.d, z14.d, z15.d\n"
- ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
- "uzp1 z0.d, z8.d, z9.d\n"
- ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
- "uzp1 z1.d, z10.d, z11.d\n"
- "uzp2 z5.d, z12.d, z13.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
- "uzp1 z2.d, z12.d, z13.d\n"
- "uzp1 z0.d, z18.d, z19.d\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "uzp2 z1.d, z14.d, z15.d\n"
- ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
- "uzp2 z3.d, z8.d, z9.d\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "uzp1 z7.d, z16.d, z17.d\n"
- "4:\n"
- "uzp2 z2.d, z16.d, z17.d\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "uzp2 z3.d, z18.d, z19.d\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "uzp1 z4.d, z20.d, z21.d\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "uzp1 z5.d, z22.d, z23.d\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "uzp1 z6.d, z24.d, z25.d\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #16\n"
- "uzp2 z7.d, z20.d, z21.d\n"
- "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n"
- "uzp2 z0.d, z22.d, z23.d\n"
- "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n"
- "uzp2 z1.d, z24.d, z25.d\n"
- "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n"
- "uzp1 z2.d, z26.d, z27.d\n"
- "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n"
- "uzp1 z3.d, z28.d, z29.d\n"
- "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n"
- "uzp1 z4.d, z30.d, z31.d\n"
- "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n"
- "uzp2 z5.d, z26.d, z27.d\n"
- "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n"
- "uzp2 z6.d, z28.d, z29.d\n"
- "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n"
- "uzp2 z7.d, z30.d, z31.d\n"
- "st1w z0.s, p0, [%[c_ptr]]\n"
- "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
- "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
- "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
- "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
- "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
- "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
- "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
- "addvl %[c_ptr], %[c_ptr], #8\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- }
- }
+ __asm__ __volatile__(
+ "ptrue p0.b\n"
+ "1:" // Height loop
+ "ldr x22, [%x[args_ptr], %[offsetof_bblocks]]\n"
+ "mov x21, %x[Apanel]\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_Bpanel]]\n"
+ "2:" // Width loop
+ "ldr x19, [%x[args_ptr], %[offsetof_K]]\n"
+ "mov %x[Apanel], x21\n"
+ "cmp x19, #0x2\n"
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "ld1b { z4.b }, p0/Z, [x20]\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "ld1b { z5.b }, p0/Z, [x20, #1, MUL VL]\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "addvl x20, x20, #2\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "add %x[Apanel], %x[Apanel], #0x30\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "blt 4f\n"
+ "3:" // main loop head
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
+ ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
+ ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
+ ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x20]\n"
+ ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
+ ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
+ ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
+ "ld1b { z4.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x45c69809 // ummla z9.s, z0.b, z6.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c6987b // ummla z27.s, z3.b, z6.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, #4, MUL VL]\n"
+ ".inst 0x45c7980c // ummla z12.s, z0.b, z7.b\n"
+ ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
+ "sub x19, x19, #0x2\n"
+ ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #16]\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #32]\n"
+ ".inst 0x45c79858 // ummla z24.s, z2.b, z7.b\n"
+ ".inst 0x45c7987e // ummla z30.s, z3.b, z7.b\n"
+ "ld1b { z7.b }, p0/Z, [x20, #5, MUL VL]\n"
+ ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
+ ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
+ ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #64]\n"
+ "ld1b { z4.b }, p0/Z, [x20, #6, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ "cmp x19, #0x2\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
+ ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
+ ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, #-8, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [x20, #-7, MUL VL]\n"
+ ".inst 0x45c49809 // ummla z9.s, z0.b, z4.b\n"
+ ".inst 0x45c4982f // ummla z15.s, z1.b, z4.b\n"
+ ".inst 0x45c49855 // ummla z21.s, z2.b, z4.b\n"
+ ".inst 0x45c4987b // ummla z27.s, z3.b, z4.b\n"
+ "ld1b { z4.b }, p0/Z, [x20, #-6, MUL VL]\n"
+ ".inst 0x45c5980c // ummla z12.s, z0.b, z5.b\n"
+ ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
+ ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
+ ".inst 0x45c59832 // ummla z18.s, z1.b, z5.b\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel], #80]\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #96]\n"
+ ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ ".inst 0x45c5987e // ummla z30.s, z3.b, z5.b\n"
+ "ld1b { z5.b }, p0/Z, [x20, #-5, MUL VL]\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #112]\n"
+ ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
+ ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
+ "add %x[Apanel], %x[Apanel], #0x80\n"
+ "addvl x20, x20, #-4\n"
+ "bge 3b\n"
+ "4:" // main loop skip
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
+ ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
+ ".inst 0x45c5980b // ummla z11.s, z0.b, z5.b\n"
+ ".inst 0x45c59831 // ummla z17.s, z1.b, z5.b\n"
+ "ld1b { z6.b }, p0/Z, [x20]\n"
+ ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
+ ".inst 0x45c59857 // ummla z23.s, z2.b, z5.b\n"
+ "ld1b { z7.b }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
+ ".inst 0x45c5987d // ummla z29.s, z3.b, z5.b\n"
+ "ld1b { z4.b }, p0/Z, [x20, #2, MUL VL]\n"
+ "ld1b { z5.b }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x45c69809 // ummla z9.s, z0.b, z6.b\n"
+ ".inst 0x45c6982f // ummla z15.s, z1.b, z6.b\n"
+ ".inst 0x45c69855 // ummla z21.s, z2.b, z6.b\n"
+ ".inst 0x45c6987b // ummla z27.s, z3.b, z6.b\n"
+ "add %x[Apanel], %x[Apanel], #0x10\n"
+ ".inst 0x45c7980c // ummla z12.s, z0.b, z7.b\n"
+ ".inst 0x45c4980a // ummla z10.s, z0.b, z4.b\n"
+ "addvl x20, x20, #4\n"
+ ".inst 0x45c5980d // ummla z13.s, z0.b, z5.b\n"
+ ".inst 0x45c79832 // ummla z18.s, z1.b, z7.b\n"
+ ".inst 0x45c49830 // ummla z16.s, z1.b, z4.b\n"
+ ".inst 0x45c59833 // ummla z19.s, z1.b, z5.b\n"
+ ".inst 0x45c79858 // ummla z24.s, z2.b, z7.b\n"
+ ".inst 0x45c7987e // ummla z30.s, z3.b, z7.b\n"
+ ".inst 0x45c49856 // ummla z22.s, z2.b, z4.b\n"
+ ".inst 0x45c59859 // ummla z25.s, z2.b, z5.b\n"
+ ".inst 0x45c4987c // ummla z28.s, z3.b, z4.b\n"
+ ".inst 0x45c5987f // ummla z31.s, z3.b, z5.b\n"
+ "cbz x19, 5f\n"
+ "ld1b { z6.b }, p0/Z, [x20]\n"
+ "ld1rqb { z0.b }, p0/Z, [%x[Apanel]]\n"
+ ".inst 0x45c69808 // ummla z8.s, z0.b, z6.b\n"
+ "ld1rqb { z1.b }, p0/Z, [%x[Apanel], #16]\n"
+ "ld1b { z7.b }, p0/Z, [x20, #1, MUL VL]\n"
+ ".inst 0x45c6982e // ummla z14.s, z1.b, z6.b\n"
+ "ld1rqb { z2.b }, p0/Z, [%x[Apanel], #32]\n"
+ "ld1rqb { z3.b }, p0/Z, [%x[Apanel], #48]\n"
+ ".inst 0x45c7980b // ummla z11.s, z0.b, z7.b\n"
+ ".inst 0x45c79831 // ummla z17.s, z1.b, z7.b\n"
+ ".inst 0x45c69854 // ummla z20.s, z2.b, z6.b\n"
+ "ld1b { z4.b }, p0/Z, [x20, #2, MUL VL]\n"
+ ".inst 0x45c79857 // ummla z23.s, z2.b, z7.b\n"
+ ".inst 0x45c6987a // ummla z26.s, z3.b, z6.b\n"
+ "ld1b { z5.b }, p0/Z, [x20, #3, MUL VL]\n"
+ ".inst 0x45c7987d // ummla z29.s, z3.b, z7.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, #4, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [x20, #5, MUL VL]\n"
+ ".inst 0x45c49809 // ummla z9.s, z0.b, z4.b\n"
+ ".inst 0x45c4982f // ummla z15.s, z1.b, z4.b\n"
+ "add %x[Apanel], %x[Apanel], #0x40\n"
+ ".inst 0x45c49855 // ummla z21.s, z2.b, z4.b\n"
+ ".inst 0x45c4987b // ummla z27.s, z3.b, z4.b\n"
+ "addvl x20, x20, #6\n"
+ ".inst 0x45c5980c // ummla z12.s, z0.b, z5.b\n"
+ ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
+ ".inst 0x45c7980d // ummla z13.s, z0.b, z7.b\n"
+ ".inst 0x45c59832 // ummla z18.s, z1.b, z5.b\n"
+ ".inst 0x45c69830 // ummla z16.s, z1.b, z6.b\n"
+ ".inst 0x45c79833 // ummla z19.s, z1.b, z7.b\n"
+ ".inst 0x45c59858 // ummla z24.s, z2.b, z5.b\n"
+ ".inst 0x45c5987e // ummla z30.s, z3.b, z5.b\n"
+ ".inst 0x45c69856 // ummla z22.s, z2.b, z6.b\n"
+ ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
+ ".inst 0x45c6987c // ummla z28.s, z3.b, z6.b\n"
+ ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
+ "5:" // multiply loop done
+ "uzp1 z4.d, z8.d, z11.d\n"
+ "uzp2 z8.d, z8.d, z11.d\n"
+ "st1w { z4.s }, p0, [%x[Cpanel]]\n"
+ "uzp1 z11.d, z9.d, z12.d\n"
+ "uzp2 z9.d, z9.d, z12.d\n"
+ "st1w { z11.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "uzp1 z12.d, z10.d, z13.d\n"
+ "uzp2 z10.d, z10.d, z13.d\n"
+ "st1w { z12.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z8.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "uzp1 z13.d, z14.d, z17.d\n"
+ "uzp2 z14.d, z14.d, z17.d\n"
+ "st1w { z9.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "uzp1 z17.d, z15.d, z18.d\n"
+ "subs x22, x22, #0x1\n"
+ "st1w { z10.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "uzp2 z15.d, z15.d, z18.d\n"
+ "uzp1 z18.d, z16.d, z19.d\n"
+ "st1w { z13.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "uzp2 z16.d, z16.d, z19.d\n"
+ "uzp1 z19.d, z20.d, z23.d\n"
+ "st1w { z17.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #16\n"
+ "uzp2 z20.d, z20.d, z23.d\n"
+ "st1w { z18.s }, p0, [%x[Cpanel], #-8, MUL VL]\n"
+ "uzp1 z23.d, z21.d, z24.d\n"
+ "uzp2 z21.d, z21.d, z24.d\n"
+ "st1w { z14.s }, p0, [%x[Cpanel], #-7, MUL VL]\n"
+ "uzp1 z24.d, z22.d, z25.d\n"
+ "uzp2 z22.d, z22.d, z25.d\n"
+ "st1w { z15.s }, p0, [%x[Cpanel], #-6, MUL VL]\n"
+ "uzp1 z25.d, z26.d, z29.d\n"
+ "uzp2 z26.d, z26.d, z29.d\n"
+ "st1w { z16.s }, p0, [%x[Cpanel], #-5, MUL VL]\n"
+ "uzp1 z29.d, z27.d, z30.d\n"
+ "uzp2 z27.d, z27.d, z30.d\n"
+ "st1w { z19.s }, p0, [%x[Cpanel], #-4, MUL VL]\n"
+ "uzp1 z30.d, z28.d, z31.d\n"
+ "uzp2 z28.d, z28.d, z31.d\n"
+ "st1w { z23.s }, p0, [%x[Cpanel], #-3, MUL VL]\n"
+ "st1w { z24.s }, p0, [%x[Cpanel], #-2, MUL VL]\n"
+ "st1w { z20.s }, p0, [%x[Cpanel], #-1, MUL VL]\n"
+ "st1w { z21.s }, p0, [%x[Cpanel]]\n"
+ "st1w { z22.s }, p0, [%x[Cpanel], #1, MUL VL]\n"
+ "st1w { z25.s }, p0, [%x[Cpanel], #2, MUL VL]\n"
+ "st1w { z29.s }, p0, [%x[Cpanel], #3, MUL VL]\n"
+ "st1w { z30.s }, p0, [%x[Cpanel], #4, MUL VL]\n"
+ "st1w { z26.s }, p0, [%x[Cpanel], #5, MUL VL]\n"
+ "st1w { z27.s }, p0, [%x[Cpanel], #6, MUL VL]\n"
+ "st1w { z28.s }, p0, [%x[Cpanel], #7, MUL VL]\n"
+ "addvl %x[Cpanel], %x[Cpanel], #8\n"
+ "bgt 2b\n"
+ "subs %x[ablocks], %x[ablocks], #0x1\n"
+ "bne 1b\n"
+ : [Apanel] "+&r" (Apanel), [Cpanel] "+&r" (Cpanel), [ablocks] "+&r" (ablocks)
+ : [args_ptr] "r" (&ka), [offsetof_Bpanel] "I" (offsetof(KernelArgs, Bpanel)), [offsetof_K] "I" (offsetof(KernelArgs, K)), [offsetof_bblocks] "I" (offsetof(KernelArgs, bblocks))
+ : "cc", "memory", "p0", "x19", "x20", "x21", "x22", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
}
} // namespace arm_gemm
-
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
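
The generic.cpp rewrite above replaces the old per-block C++ driver (a_ptr/b_ptr plus hand-counted loops/tails shuttled into a per-block asm statement) with a single asm region that owns the height and width loops itself, fed through a small KernelArgs struct and offsetof() operands; K arrives pre-divided by the kernel's k_unroll of 8 and is walked two steps per main-loop iteration. The arithmetic is unchanged: each ummla multiplies a 2x8 uint8 tile of A by an 8x2 tile of B per 128-bit segment, which is also why the epilogue still needs the uzp1/uzp2 passes to de-interleave the 2x2 output tiles into plain rows before the st1w stores. A scalar model of one segment (a sketch; the hardware repeats this per 128-bit segment of the vector):

    #include <cstdint>

    // Scalar model of one UMMLA segment: 'a' is a 2x8 uint8 tile (row-major,
    // K contiguous), 'b' an 8x2 uint8 tile (column-major, K contiguous), and
    // the four 32-bit accumulator lanes hold the 2x2 product as
    // {c00, c01, c10, c11}.
    void ummla_segment(uint32_t acc[4], const uint8_t a[16], const uint8_t b[16])
    {
        for (int i = 0; i < 2; ++i) {
            for (int j = 0; j < 2; ++j) {
                uint32_t sum = 0;
                for (int k = 0; k < 8; ++k) {
                    sum += uint32_t(a[i * 8 + k]) * uint32_t(b[j * 8 + k]);
                }
                acc[i * 2 + j] += sum;
            }
        }
    }
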
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
index cdad98c5f1..c021539099 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -18804,4 +18804,4 @@ void sve_smallK_hybrid_fp32_mla_8x1VL(const float *A, int lda, const float *B, f
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // ARM_COMPUTE_ENABLE_SVE
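
The smallK hunks that follow are almost entirely deletions of explicit software prefetches: prfm PLDL1KEEP on the A-panel pointers and PSTL1KEEP on the C pointers. Each removed line corresponds roughly to the hint sketched below; presumably the sequential panel walks are already covered by the hardware prefetcher, so the hints only cost issue slots, though the patch itself gives no rationale. A sketch using the GCC/Clang builtin, with the pointer names taken from the asm operands:

    #include <cstdint>

    // Approximate C equivalents of the removed hints, e.g.
    //   "prfm PLDL1KEEP, [a_ptr7, #0x40]"    (prefetch for load, keep in L1)
    //   "prfm PSTL1KEEP, [%[c_ptr0], #0x40]" (prefetch for store, keep in L1)
    static inline void prefetch_hints(const int8_t *a_ptr7, int32_t *c_ptr0)
    {
        __builtin_prefetch(a_ptr7 + 0x40, /*rw=*/0, /*locality=*/3);
        __builtin_prefetch(reinterpret_cast<char *>(c_ptr0) + 0x40,
                           /*rw=*/1, /*locality=*/3);
    }
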
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
index cd01411722..489b381624 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -115,19 +115,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
"ld1rqb z7.b, p6/z, [a_ptr7]\n"
@@ -185,15 +179,7 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
@@ -339,17 +325,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -419,21 +399,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"b.ne 4b\n"
@@ -598,17 +570,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -689,21 +655,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z16.b, z6.b[0]\n"
"addvl %[b_ptr0], %[b_ptr0], #3\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
@@ -892,17 +850,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -993,21 +945,13 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z16.b, z6.b[0]\n"
"addvl %[b_ptr0], %[b_ptr0], #4\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
@@ -1221,17 +1165,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"addvl %[b_ptr0], %[b_ptr0], #5\n"
"cbz %[loops], 2f\n"
"mov z24.s, #0\n"
@@ -1312,7 +1250,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
@@ -1350,19 +1287,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
@@ -1641,17 +1571,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #6\n"
"cbz %[loops], 2f\n"
@@ -1741,7 +1665,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
@@ -1753,7 +1676,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z27.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
@@ -1781,17 +1703,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
@@ -2096,17 +2012,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #7\n"
@@ -2205,13 +2115,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -2245,17 +2153,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -2586,17 +2488,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -2704,13 +2600,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -2722,7 +2616,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -2746,15 +2639,10 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
@@ -3111,17 +2999,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -3247,13 +3129,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -3265,7 +3145,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -3289,19 +3168,14 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
@@ -3708,17 +3582,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -3853,13 +3721,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -3871,7 +3737,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -3895,17 +3760,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -4341,17 +4201,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -4495,13 +4349,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -4513,7 +4365,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -4537,17 +4388,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -5010,17 +4856,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -5173,13 +5013,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -5191,7 +5029,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -5215,17 +5052,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -5715,17 +5547,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -5895,13 +5721,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -5913,7 +5737,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -5937,17 +5760,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -6488,17 +6306,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -6677,13 +6489,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -6695,7 +6505,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -6719,17 +6528,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -7297,17 +7101,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -7495,13 +7293,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -7513,7 +7309,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -7537,17 +7332,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -8143,17 +7933,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -8350,13 +8134,11 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -8368,7 +8150,6 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -8392,17 +8173,12 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -8968,4 +8744,4 @@ void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // __ARM_FEATURE_SVE
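Most of the churn in this kernel is the deletion of software prefetch hints: prfm PLDL1KEEP on the A-panel pointers and prfm PSTL1KEEP on the C pointers. As a rough sketch of what such a hint does — the loop below is hypothetical and not taken from the kernel — __builtin_prefetch typically lowers to the same PRFM instructions on AArch64 (rw=0, locality=3 gives PLDL1KEEP; rw=1 gives PSTL1KEEP):

    #include <cstddef>

    // Minimal sketch: issue a load prefetch hint a fixed distance ahead of
    // a streaming read. On AArch64, GCC/Clang typically emit
    // "prfm pldl1keep" for this builtin with these arguments.
    float sum_with_prefetch(const float *a, size_t n) {
        float acc = 0.0f;
        for (size_t i = 0; i < n; i++) {
            if (i + 16 < n) {
                __builtin_prefetch(a + i + 16, /*rw=*/0, /*locality=*/3);
            }
            acc += a[i];
        }
        return acc;
    }

Dropping the hints trades a possible cache-warm-up win for fewer issued instructions per iteration; with K small, the hardware prefetcher presumably covers these short streams anyway.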
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
index 99a287b4f5..8ab83e670e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool) {
+void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)get_vector_length<uint32_t>()) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -115,19 +115,13 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
"ld1rqb z7.b, p6/z, [a_ptr7]\n"
@@ -185,15 +179,7 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
@@ -339,17 +325,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -419,21 +399,13 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- "udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"b.ne 4b\n"
@@ -598,17 +570,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -689,21 +655,13 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z30.s, z16.b, z6.b[0]\n"
"addvl %[b_ptr0], %[b_ptr0], #3\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
@@ -892,17 +850,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1rqb z1.b, p6/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1rqb z2.b, p6/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1rqb z3.b, p6/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1rqb z4.b, p6/z, [a_ptr4]\n"
"ld1rqb z5.b, p6/z, [a_ptr5]\n"
"ld1rqb z6.b, p6/z, [a_ptr6]\n"
@@ -993,21 +945,13 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z30.s, z16.b, z6.b[0]\n"
"addvl %[b_ptr0], %[b_ptr0], #4\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
@@ -1221,17 +1165,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"addvl %[b_ptr0], %[b_ptr0], #5\n"
"cbz %[loops], 2f\n"
"mov z24.s, #0\n"
@@ -1312,7 +1250,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
@@ -1350,19 +1287,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
@@ -1641,17 +1571,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #6\n"
"cbz %[loops], 2f\n"
@@ -1741,7 +1665,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
@@ -1753,7 +1676,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z27.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
@@ -1781,17 +1703,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"udot z24.s, z18.b, z0.b[2]\n"
@@ -2096,17 +2012,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #7\n"
@@ -2205,13 +2115,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -2245,17 +2153,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -2586,17 +2488,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -2704,13 +2600,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -2722,7 +2616,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -2746,15 +2639,10 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
@@ -3111,17 +2999,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -3247,13 +3129,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -3265,7 +3145,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -3289,19 +3168,14 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"udot z24.s, z18.b, z0.b[2]\n"
@@ -3708,17 +3582,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -3853,13 +3721,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -3871,7 +3737,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -3895,17 +3760,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -4341,17 +4201,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -4495,13 +4349,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -4513,7 +4365,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -4537,17 +4388,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -5010,17 +4856,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -5173,13 +5013,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -5191,7 +5029,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -5215,17 +5052,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -5715,17 +5547,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -5895,13 +5721,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -5913,7 +5737,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -5937,17 +5760,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -6488,17 +6306,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -6677,13 +6489,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -6695,7 +6505,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -6719,17 +6528,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -7297,17 +7101,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -7495,13 +7293,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -7513,7 +7309,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -7537,17 +7332,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -8143,17 +7933,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"ptrue p7.b\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
"whilelt p0.s, %[temp], %[last_width]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
@@ -8350,13 +8134,11 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
@@ -8368,7 +8150,6 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
@@ -8392,17 +8173,12 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -8968,4 +8744,4 @@ void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t
} // namespace arm_gemm
-#endif // ARM_COMPUTE_ENABLE_SVE
+#endif // __ARM_FEATURE_SVE
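Both of these smallK kernels are built around the SVE byte dot-product instructions (sdot in the s8s32 variant, udot here). A simplified scalar model of one 32-bit lane may help when reading the hunks: in the indexed form "udot z24.s, z16.b, z0.b[0]", each accumulator lane adds the dot product of four unsigned bytes of the B operand with a selected group of four A bytes (broadcast within each 128-bit segment):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of a single UDOT accumulator lane: acc += dot(b[0..3], a[0..3]).
    // Real SVE applies this across every 32-bit lane of the vector at once.
    uint32_t udot_lane(uint32_t acc, const uint8_t b[4], const uint8_t a[4]) {
        for (int i = 0; i < 4; i++) {
            acc += (uint32_t)b[i] * (uint32_t)a[i];
        }
        return acc;
    }

    int main() {
        uint8_t b[4] = {1, 2, 3, 4};
        uint8_t a[4] = {10, 20, 30, 40};
        printf("%u\n", udot_lane(0, b, a)); // 1*10 + 2*20 + 3*30 + 4*40 = 300
    }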
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp
new file mode 100644
index 0000000000..a7525e5ec1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* As some of the merges need these headers, but are all included in the
+ * arm_gemm namespace, put these headers here. */
+#include <algorithm>
+
+#include <arm_neon.h>
+
+#include "arm_gemm.hpp"
+#include "asmlib.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+
+namespace arm_gemm {
+
+#include "merges/list-fp16.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp
index 77d86b7dd8..a4124c4a54 100644
--- a/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,4 +38,4 @@ namespace arm_gemm {
#include "merges/list-sve.hpp"
-} // namespace arm_gemm
\ No newline at end of file
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
index bbfe8f23d9..2b712cee61 100644
--- a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018, 2021 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
/* As some of the merges need these headers, but are all included in the
* arm_gemm namespace, put these headers here. */
#include <algorithm>
-#include <limits>
#include <arm_neon.h>
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp b/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp
new file mode 100644
index 0000000000..c1356347df
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/list-fp16.hpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "a64_merge_fp16_24x8.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
index dae874ef94..3443c6f0a8 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "a32_merge_float_8x6.hpp"
-#include "a64_merge_fp16_24x8.hpp"
#include "a64_merge_fp32_12x8.hpp"
#include "a64_merge_s32_12x8.hpp"
#include "a64_merge_s32_4x4.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index fdb4f584d8..1e2a9acc1d 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -198,6 +198,19 @@ public:
_params.bias = bias;
_params.bias_multi_stride = bias_multi_stride;
}
+
+ GemmConfig get_config() override {
+ GemmConfig c = _subgemm->get_config();
+
+ std::string n = "quantize_wrapper[";
+ n.append(c.filter);
+ n.append("]");
+
+ c.method = GemmMethod::QUANTIZE_WRAPPER;
+ c.filter = n;
+
+ return c;
+ }
};
} // namespace arm_gemm
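The new get_config() override reports the wrapper in front of whatever the inner GEMM reports, so one filter string shows both layers. A minimal standalone sketch of the same naming scheme (the GemmConfig/GemmMethod stand-ins below are simplified, and the inner filter name is just an example):

    #include <iostream>
    #include <string>

    // Simplified stand-ins for the arm_gemm types used above.
    enum class GemmMethod { DEFAULT, QUANTIZE_WRAPPER };
    struct GemmConfig {
        GemmMethod method;
        std::string filter;
    };

    // Mirrors the override's behaviour: copy the inner config, then
    // decorate its filter name and retag the method.
    GemmConfig wrap_config(const GemmConfig &inner) {
        GemmConfig c = inner;
        c.method = GemmMethod::QUANTIZE_WRAPPER;
        c.filter = "quantize_wrapper[" + inner.filter + "]";
        return c;
    }

    int main() {
        GemmConfig inner;
        inner.method = GemmMethod::DEFAULT;
        inner.filter = "a64_gemm_u8_8x12";
        std::cout << wrap_config(inner).filter << "\n"; // quantize_wrapper[a64_gemm_u8_8x12]
    }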
diff --git a/src/core/NEON/kernels/arm_gemm/transform-sve.cpp b/src/core/NEON/kernels/arm_gemm/transform-sve.cpp
new file mode 100644
index 0000000000..3f6963d32b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transform-sve.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "utils.hpp"
+
+#include "bfloat.hpp"
+#include "transform.hpp"
+
+#include <alloca.h>
+
+namespace arm_gemm {
+
+#include "transforms/list-sve.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/transform.cpp b/src/core/NEON/kernels/arm_gemm/transform.cpp
new file mode 100644
index 0000000000..60376ab80b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transform.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "utils.hpp"
+
+#include "bfloat.hpp"
+
+#include <alloca.h>
+
+namespace arm_gemm {
+
+/*
+ * Generic transform.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row. This same number of values
+ * are then read from the next <IntBy-1> rows. Now return to the first
+ * input row and repeat.
+ *
+ * Need to cope with the work requested in either dimension not actually
+ * being a multiple of the block sizes.
+ */
+template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, VLType vlt>
+struct TransformImpl {
+ template <typename TOut, typename TIn>
+ static void Transform(TOut* out, const TIn* const in, const int stride,
+ const int y0, const int ymax, const int x0, const int xmax) {
+ // NOTE: This code is disabled to avoid the call to get_vector_length(), so templated transforms will not be
+ // correct for SVE. This is not an issue as we have specializations for all SVE cases.
+ // For SVE cases we multiply the interleave factor by the vector length.
+ // const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1);
+ const unsigned int IntBy = tIntBy;
+
+ const int n_whole_y_blocks = (ymax - y0) / IntBy;
+ const int y_remainders = (ymax - y0) % IntBy;
+ const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+ const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+ const int x_remainders = (xmax - x0) % BlockBy;
+ const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+ // "Y" loop: advance down the rows of the source IntBy rows at a time.
+ // Set up fill_rows to show the number of rows to copy from, and blank_rows
+ // for the number of blank rows to add.
+ for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
+ int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+ int blank_rows = IntBy - fill_rows;
+
+ int y_base = y0 + (y_block * IntBy);
+
+ // So now advance along this block of rows, BlockBy columns at a time.
+ for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
+ int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+ int blank_cols = BlockBy - fill_cols;
+
+ int x_base = x0 + (x_block * BlockBy);
+
+ for (int row = 0; row < fill_rows; row++) {
+ for (int col = 0; col < fill_cols; col++) {
+ // In-range copy. If it's transposed, we reverse the sense of rows and columns here.
+ if (Transposed) {
+ *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
+ } else {
+ *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
+ }
+ }
+ // "col" tail - row is in range but column is out of range.
+ for (int col=0; col < blank_cols; col++) {
+ *out++ = static_cast<TOut>(0);
+ }
+ }
+ // "row" tail - row is out of range so fill with zeros always.
+ TOut zeroval = static_cast<TOut>(0);
+ int pads = blank_rows * (fill_cols + blank_cols);
+
+ for (int i=0; i<pads; i++) {
+ out[i] = zeroval;
+ }
+
+ out += pads;
+ }
+ }
+ }
+
+ template <typename T>
+ static void Transform(T* out, const T* const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax) {
+ Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
+ }
+};
+
+/*****************************************************************************/
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt=VLType::None, typename TOut, typename TIn>
+void Transform(
+ TOut* out, const TIn* const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax
+) {
+ // Redirect to a specialised implementation predicated on argument size.
+ TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform(
+ out, in, stride, k0, kmax, x0, xmax
+ );
+}
+/*****************************************************************************/
+
+#include "transforms/list.hpp"
+
+// We don't have assembler transforms for AArch32, so generate templated ones here.
+#ifdef __arm__
+template void Transform<8, 1, true, VLType::None>(float *, const float *, int, int, int, int, int);
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+template void Transform<8, 1, true, VLType::None>(float *, const __fp16 *, int, int, int, int, int);
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+template void Transform<8, 1, true, VLType::None>(float *, const bfloat16 *, int, int, int, int, int);
+#endif // AArch32
+
+} // namespace arm_gemm
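The header comment above describes the interleave order in words; a small fixed-size run makes it concrete. This sketch hard-codes IntBy=4, BlockBy=1, untransposed — a deliberate simplification of the template above — and shows the zero fill applied to the row tail:

    #include <cstdio>

    // Simplified instance of the generic transform: read one value (BlockBy=1)
    // from each of 4 consecutive rows (IntBy=4), then move to the next column;
    // rows past ymax are padded with zeros.
    void transform_4x1(float *out, const float *in, int stride, int ymax, int xmax) {
        for (int y = 0; y < ymax; y += 4) {
            for (int x = 0; x < xmax; x++) {
                for (int r = 0; r < 4; r++) {
                    int row = y + r;
                    *out++ = (row < ymax) ? in[row * stride + x] : 0.0f;
                }
            }
        }
    }

    int main() {
        const float in[3 * 5] = { 0, 1, 2, 3, 4,
                                  5, 6, 7, 8, 9,
                                 10,11,12,13,14 };
        float out[4 * 5];
        transform_4x1(out, in, /*stride=*/5, /*ymax=*/3, /*xmax=*/5);
        for (int i = 0; i < 20; i++) printf("%g ", out[i]);
        printf("\n"); // 0 5 10 0 1 6 11 0 2 7 12 0 3 8 13 0 4 9 14 0
    }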
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index 5efeee5d35..f46e6c5fa3 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,96 +27,10 @@
namespace arm_gemm {
-/*
- * Generic transform.
- *
- * Assuming the untransposed case, this works by first reading <BlockBy>
- * consecutive values from the first input row. This same number of values
- * are then read from the next <IntBy-1> rows. Now return to the first
- * input row and repeat.
- *
- * Need to cope with the work requested in either dimension not actually
- * being a multiple of the block sizes.
- */
-template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, VLType vlt>
-struct TransformImpl {
- template <typename TOut, typename TIn>
- static void Transform(TOut* out, const TIn* const in, const int stride,
- const int y0, const int ymax, const int x0, const int xmax) {
- // For SVE cases we multiply the interleave factor by the vector length.
- const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1);
-
- const int n_whole_y_blocks = (ymax - y0) / IntBy;
- const int y_remainders = (ymax - y0) % IntBy;
- const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
-
- const int n_whole_x_blocks = (xmax - x0) / BlockBy;
- const int x_remainders = (xmax - x0) % BlockBy;
- const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
-
- // "Y" loop: advance down the rows of the source IntBy rows at a time.
- // Set up fill_rows to show the number rows to copy from, and blank_rows
- // for the number of blank rows to add.
- for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
- int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
- int blank_rows = IntBy - fill_rows;
-
- int y_base = y0 + (y_block * IntBy);
-
- // So now advance along this block of rows, BlockBy columns at a time.
- for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
- int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
- int blank_cols = BlockBy - fill_cols;
-
- int x_base = x0 + (x_block * BlockBy);
-
- for (int row = 0; row < fill_rows; row++) {
- for (int col = 0; col < fill_cols; col++) {
- // In-range copy. If it's transposed, we reverse the sense of rows and columns here.
- if (Transposed) {
- *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
- } else {
- *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
- }
- }
- // "col" tail - row is in range but column is out of range.
- for (int col=0; col < blank_cols; col++) {
- *out++ = static_cast<TOut>(0);
- }
- }
- // "row" tail - row is out of range so fill with zeros always.
- TOut zeroval = static_cast<TOut>(0);
- int pads = blank_rows * (fill_cols + blank_cols);
-
- for (int i=0; i<pads; i++) {
- out[i] = zeroval;
- }
-
- out += pads;
- }
- }
- }
-
- template <typename T>
- static inline void Transform(T* out, const T* const in, const int stride,
- const int k0, const int kmax, const int x0, const int xmax) {
- Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
- }
-};
-
-/*****************************************************************************/
template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt=VLType::None, typename TOut, typename TIn>
void Transform(
TOut* out, const TIn* const in, const int stride,
const int k0, const int kmax, const int x0, const int xmax
-) {
- // Redirect to a specialised implementation predicated on argument size.
- TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform(
- out, in, stride, k0, kmax, x0, xmax
- );
-}
-/*****************************************************************************/
-
-#include "transforms/list.hpp"
+);
} // namespace arm_gemm
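
For reference, a minimal scalar sketch of the semantics the removed generic TransformImpl provided, which can serve as a reference when validating the per-kernel specialisations that replace it. The name reference_transform is illustrative, and the SVE vector-length scaling of the interleave factor is omitted:

template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
void reference_transform(TOut *out, const TIn *in, const int stride,
                         const int y0, const int ymax, const int x0, const int xmax) {
    // Walk the source IntBy rows and BlockBy columns at a time, exactly as
    // the removed generic implementation did, zero-filling any tail that
    // falls outside [y0,ymax) x [x0,xmax).
    for (int yb = y0; yb < ymax; yb += IntBy) {
        for (int xb = x0; xb < xmax; xb += BlockBy) {
            for (unsigned int r = 0; r < IntBy; r++) {
                for (unsigned int c = 0; c < BlockBy; c++) {
                    const int y = yb + static_cast<int>(r);
                    const int x = xb + static_cast<int>(c);
                    if (y < ymax && x < xmax) {
                        // Transposed reverses the sense of rows and columns.
                        *out++ = static_cast<TOut>(Transposed ? in[x * stride + y]
                                                              : in[y * stride + x]);
                    } else {
                        *out++ = static_cast<TOut>(0);
                    }
                }
            }
        }
    }
}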
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
index 3ce1d328a7..b50c240a3a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,7 @@
// Generic unblocked transposed 8x32-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
+void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
@@ -45,7 +45,7 @@ inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
// Generic 16x16-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
+void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
@@ -59,7 +59,7 @@ inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
// Specialised 16 x uint16_t version
template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
__asm volatile (
"VLD1.32 {d0-d3}, [%[in0]]!\n"
"VST1.32 {d0-d3}, [%[out]]\n"
@@ -72,7 +72,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(con
}
template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
+void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
__asm volatile (
"VLD1.32 {d0-d3}, [%[in0]]!\n"
"VST1.32 {d0-d3}, [%[out]]!\n"
@@ -90,7 +90,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(con
}
template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
__asm __volatile (
"VLD1.32 {d0-d3}, [%[in0]]!\n"
"VST1.32 {d0-d3}, [%[out]]!\n"
@@ -117,7 +117,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con
template <>
template <>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
+void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
uint16_t* out, const uint16_t* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
new file mode 100644
index 0000000000..41c1c282e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_128.hpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_128(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 32 * height * sizeof(uint32_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q15, [x24], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "ldr q14, [x22], #0x10\n"
+ "cmp x19, #0x20\n"
+ "ldr q13, [x21], #0x10\n"
+ "ldr q12, [x20], #0x10\n"
+ "ldr q11, [x24], #0x10\n"
+ "ldr q10, [x22], #0x10\n"
+ "ldr q9, [x21], #0x10\n"
+ "ldr q8, [x20], #0x10\n"
+ "ldr q7, [x24], #0x10\n"
+ "ldr q6, [x22], #0x10\n"
+ "ldr q5, [x21], #0x10\n"
+ "ldr q4, [x20], #0x10\n"
+ "ldr q3, [x24], #0x10\n"
+ "ldr q2, [x22], #0x10\n"
+ "ldr q1, [x21], #0x10\n"
+ "ldr q0, [x20], #0x10\n"
+ "ldr q31, [x24], #0x10\n"
+ "ldr q30, [x22], #0x10\n"
+ "ldr q29, [x21], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x24], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "ldr q25, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q15, [x23, #0x0]\n"
+ "str q11, [x23, #0x10]\n"
+ "str q7, [x23, #0x20]\n"
+ "str q3, [x23, #0x30]\n"
+ "str q31, [x23, #0x40]\n"
+ "str q27, [x23, #0x50]\n"
+ "str q23, [x23, #0x60]\n"
+ "str q19, [x23, #0x70]\n"
+ "str q14, [x23, #0x80]\n"
+ "str q10, [x23, #0x90]\n"
+ "str q6, [x23, #0xa0]\n"
+ "str q2, [x23, #0xb0]\n"
+ "str q30, [x23, #0xc0]\n"
+ "str q26, [x23, #0xd0]\n"
+ "str q22, [x23, #0xe0]\n"
+ "str q18, [x23, #0xf0]\n"
+ "str q13, [x23, #0x100]\n"
+ "str q9, [x23, #0x110]\n"
+ "str q5, [x23, #0x120]\n"
+ "str q1, [x23, #0x130]\n"
+ "str q29, [x23, #0x140]\n"
+ "str q25, [x23, #0x150]\n"
+ "str q21, [x23, #0x160]\n"
+ "str q17, [x23, #0x170]\n"
+ "str q12, [x23, #0x180]\n"
+ "str q8, [x23, #0x190]\n"
+ "str q4, [x23, #0x1a0]\n"
+ "str q0, [x23, #0x1b0]\n"
+ "str q28, [x23, #0x1c0]\n"
+ "str q24, [x23, #0x1d0]\n"
+ "str q20, [x23, #0x1e0]\n"
+ "str q16, [x23, #0x1f0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q31, [x24], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q30, [x22], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q29, [x21], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x24], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "ldr q25, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q31, [x23, #0x0]\n"
+ "str q27, [x23, #0x10]\n"
+ "str q23, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q30, [x23, #0x80]\n"
+ "str q26, [x23, #0x90]\n"
+ "str q22, [x23, #0xa0]\n"
+ "str q18, [x23, #0xb0]\n"
+ "str q29, [x23, #0x100]\n"
+ "str q25, [x23, #0x110]\n"
+ "str q21, [x23, #0x120]\n"
+ "str q17, [x23, #0x130]\n"
+ "str q28, [x23, #0x180]\n"
+ "str q24, [x23, #0x190]\n"
+ "str q20, [x23, #0x1a0]\n"
+ "str q16, [x23, #0x1b0]\n"
+ "add x23, x23, #0x40\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr q19, [x24], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x22], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [x23, #0x0]\n"
+ "str q18, [x23, #0x80]\n"
+ "str q17, [x23, #0x100]\n"
+ "str q16, [x23, #0x180]\n"
+ "add x23, x23, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x24], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s18, [x22], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "str s19, [x23, #0x0]\n"
+ "str s18, [x23, #0x80]\n"
+ "str s17, [x23, #0x100]\n"
+ "str s16, [x23, #0x180]\n"
+ "add x23, x23, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x200\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q23, [x24], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "cmp x19, #0x20\n"
+ "ldr q22, [x24], #0x10\n"
+ "ldr q21, [x24], #0x10\n"
+ "ldr q20, [x24], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "str q23, [x23, #0x0]\n"
+ "str q22, [x23, #0x10]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q20, [x23, #0x30]\n"
+ "str q19, [x23, #0x40]\n"
+ "str q18, [x23, #0x50]\n"
+ "str q17, [x23, #0x60]\n"
+ "str q16, [x23, #0x70]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q19, [x24], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "str q19, [x23, #0x0]\n"
+ "str q18, [x23, #0x10]\n"
+ "str q17, [x23, #0x20]\n"
+ "str q16, [x23, #0x30]\n"
+ "add x23, x23, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q16, [x24], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "cmp x19, #0x4\n"
+ "str q16, [x23, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr s16, [x24], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "cmp x19, #0x1\n"
+ "str s16, [x23, #0x0]\n"
+ "add x23, x23, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x80\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<32, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_128(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
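
A scalar model of the layout this kernel produces may help when reading the assembly; the function name is illustrative, and stride is an element count here where the assembly takes a byte stride. Source row r, column x lands in the (x/32)-th column chunk, inside which each of the height rows owns a contiguous 32-element slot (32 floats = 128 bytes, hence the kernel name); partial tail chunks write only the positions present in the source, matching the assembly's tail loops:

#include <cstddef>
#include <cstdint>

static void reference_transpose_interleave_128(uint32_t *out, const uint32_t *in,
                                               size_t width, size_t stride,
                                               size_t height) {
    for (size_t r = 0; r < height; r++) {
        for (size_t x = 0; x < width; x++) {
            // chunk base + row slot within the chunk + position within the slot
            out[(x / 32) * 32 * height + r * 32 + (x % 32)] = in[r * stride + x];
        }
    }
}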
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
new file mode 100644
index 0000000000..ec3273a526
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x4.hpp
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_12_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x30\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x30\n"
+ "ldr q23, [x26], #0x10\n"
+ "cmp x19, #0x30\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v22.16b, v18.16b, v16.16b\n"
+ "ldr q11, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v10.16b, v17.16b, v16.16b\n"
+ "ldr q9, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v8.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v7.16b, v18.16b, v17.16b\n"
+ "ldr q20, [x23], #0x10\n"
+ "ldr q6, [x22], #0x10\n"
+ "zip1 v17.16b, v23.16b, v16.16b\n"
+ "ldr q5, [x24], #0x10\n"
+ "zip2 v16.16b, v23.16b, v16.16b\n"
+ "ldr q4, [x23], #0x10\n"
+ "zip1 v3.16b, v19.16b, v17.16b\n"
+ "ldr q2, [x22], #0x10\n"
+ "zip2 v1.16b, v19.16b, v17.16b\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip1 v0.16b, v22.16b, v16.16b\n"
+ "ldr q31, [x24], #0x10\n"
+ "zip2 v30.16b, v22.16b, v16.16b\n"
+ "ldr q29, [x23], #0x10\n"
+ "zip1 v16.16b, v11.16b, v5.16b\n"
+ "ldr q28, [x22], #0x10\n"
+ "zip1 v27.16b, v21.16b, v16.16b\n"
+ "ldr q26, [x21], #0x10\n"
+ "zip1 v18.16b, v20.16b, v19.16b\n"
+ "ldr q17, [x20], #0x10\n"
+ "zip2 v20.16b, v20.16b, v19.16b\n"
+ "ldr q25, [x21], #0x10\n"
+ "zip2 v24.16b, v21.16b, v16.16b\n"
+ "zip1 v23.16b, v4.16b, v26.16b\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v16.16b, v6.16b, v17.16b\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "zip2 v18.16b, v18.16b, v16.16b\n"
+ "str q3, [x27, #0x0]\n"
+ "zip2 v16.16b, v6.16b, v17.16b\n"
+ "str q1, [x27, #0x10]\n"
+ "zip1 v17.16b, v20.16b, v16.16b\n"
+ "str q0, [x27, #0x20]\n"
+ "zip2 v20.16b, v20.16b, v16.16b\n"
+ "str q19, [x27, #0x30]\n"
+ "zip1 v16.16b, v2.16b, v22.16b\n"
+ "str q18, [x27, #0x40]\n"
+ "zip1 v19.16b, v23.16b, v16.16b\n"
+ "str q17, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v18.16b, v23.16b, v16.16b\n"
+ "str q30, [x27, #0x0]\n"
+ "zip2 v17.16b, v11.16b, v5.16b\n"
+ "str q27, [x27, #0x10]\n"
+ "zip1 v16.16b, v10.16b, v17.16b\n"
+ "str q24, [x27, #0x20]\n"
+ "zip2 v17.16b, v10.16b, v17.16b\n"
+ "str q20, [x27, #0x30]\n"
+ "zip1 v20.16b, v9.16b, v31.16b\n"
+ "str q19, [x27, #0x40]\n"
+ "zip1 v19.16b, v8.16b, v20.16b\n"
+ "str q18, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v18.16b, v4.16b, v26.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v2.16b, v22.16b\n"
+ "str q17, [x27, #0x10]\n"
+ "zip1 v17.16b, v18.16b, v16.16b\n"
+ "str q19, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v16.16b\n"
+ "str q17, [x27, #0x30]\n"
+ "zip1 v19.16b, v29.16b, v25.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip1 v17.16b, v28.16b, v21.16b\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v16.16b, v8.16b, v20.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v18.16b, v9.16b, v31.16b\n"
+ "zip2 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v7.16b, v18.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v16.16b, v7.16b, v18.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v18.16b, v29.16b, v25.16b\n"
+ "str q17, [x27, #0x30]\n"
+ "zip2 v17.16b, v28.16b, v21.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr d19, [x28], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "ldr d18, [x26], #0x8\n"
+ "cmp x19, #0xc\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ld1 { v19.s }[2], [x28], #0x4\n"
+ "ld1 { v18.s }[2], [x26], #0x4\n"
+ "ld1 { v17.s }[2], [x25], #0x4\n"
+ "zip1 v23.16b, v19.16b, v17.16b\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "zip2 v20.16b, v19.16b, v17.16b\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ldr d22, [x22], #0x8\n"
+ "zip1 v17.16b, v18.16b, v16.16b\n"
+ "ldr d19, [x21], #0x8\n"
+ "zip2 v16.16b, v18.16b, v16.16b\n"
+ "ld1 { v22.s }[2], [x22], #0x4\n"
+ "zip1 v18.16b, v23.16b, v17.16b\n"
+ "ldr d21, [x20], #0x8\n"
+ "zip2 v17.16b, v23.16b, v17.16b\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ld1 { v21.s }[2], [x20], #0x4\n"
+ "zip1 v20.16b, v24.16b, v19.16b\n"
+ "str q18, [x27, #0x0]\n"
+ "zip2 v19.16b, v24.16b, v19.16b\n"
+ "str q17, [x27, #0x10]\n"
+ "str q16, [x27, #0x20]\n"
+ "zip1 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v17.16b, v22.16b, v21.16b\n"
+ "zip1 v16.16b, v20.16b, v18.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v16.16b, v20.16b, v18.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s18, [x28], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "str q18, [x27, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "add x27, x27, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b18, [x28], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x25], #0x1\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b16, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "ldr b19, [x22], #0x1\n"
+ "ldr b17, [x21], #0x1\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "str s18, [x27, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x27, #0x30]\n"
+ "add x27, x27, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x30\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x30\n"
+ "ldr q19, [x26], #0x10\n"
+ "cmp x19, #0x30\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v28.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v27.16b, v18.16b, v16.16b\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.16b, v17.16b, v16.16b\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v24.16b, v17.16b, v16.16b\n"
+ "ldr q23, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v22.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x24], #0x10\n"
+ "zip2 v21.16b, v18.16b, v16.16b\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "zip2 v18.16b, v19.16b, v17.16b\n"
+ "ldr q19, [x24], #0x10\n"
+ "zip1 v17.16b, v28.16b, v16.16b\n"
+ "zip2 v16.16b, v28.16b, v16.16b\n"
+ "str q17, [x27, #0x0]\n"
+ "zip1 v17.16b, v27.16b, v18.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v16.16b, v27.16b, v18.16b\n"
+ "str q17, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip1 v18.16b, v26.16b, v20.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v17.16b, v26.16b, v20.16b\n"
+ "zip1 v16.16b, v25.16b, v18.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v16.16b, v25.16b, v18.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip1 v16.16b, v24.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v24.16b, v17.16b\n"
+ "zip1 v17.16b, v23.16b, v19.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip1 v16.16b, v22.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v16.16b, v22.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v17.16b, v23.16b, v19.16b\n"
+ "zip1 v16.16b, v21.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v16.16b, v21.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr d18, [x28], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "ldr d21, [x26], #0x8\n"
+ "cmp x19, #0xc\n"
+ "ldr d17, [x25], #0x8\n"
+ "ldr d16, [x24], #0x8\n"
+ "ld1 { v18.s }[2], [x28], #0x4\n"
+ "ld1 { v21.s }[2], [x26], #0x4\n"
+ "ld1 { v17.s }[2], [x25], #0x4\n"
+ "zip1 v20.16b, v18.16b, v17.16b\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "zip2 v19.16b, v18.16b, v17.16b\n"
+ "zip1 v18.16b, v21.16b, v16.16b\n"
+ "zip2 v17.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v18.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v20.16b, v18.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr s17, [x28], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s18, [x26], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b17, [x28], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b18, [x26], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x25], #0x1\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x27, #0x0]\n"
+ "add x27, x27, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x30\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
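
The zip1/zip2 chains above implement a 4-way byte interleave: within each 12-column output strip, the four bytes of a source column sit adjacently, with rows zero-padded to a multiple of 4 via pad_row. A scalar model, with an illustrative name and an element stride where the assembly takes bytes:

#include <cstddef>
#include <cstdint>

static void reference_interleave_12_1x4(uint8_t *out, const uint8_t *in,
                                        size_t width, size_t stride,
                                        size_t height) {
    const size_t padded = (height + 3) / 4 * 4;   // rows padded to a multiple of 4
    for (size_t x = 0; x < width; x++) {
        for (size_t r = 0; r < padded; r++) {
            const uint8_t v = (r < height) ? in[r * stride + x] : 0;  // pad_row supplies zeros
            // 12-column strip, then 4-row group, then column, then row-in-group
            out[(x / 12) * 12 * padded + (r / 4) * 48 + (x % 12) * 4 + (r % 4)] = v;
        }
    }
}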
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
new file mode 100644
index 0000000000..1603be2ef8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_1x8.hpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_12_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 8) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "csel x20, x20, %x[pad_row], GT\n"
+ "csel x21, x21, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x30\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x30\n"
+ "ldr q19, [x26], #0x10\n"
+ "cmp x19, #0x30\n"
+ "ldr q11, [x25], #0x10\n"
+ "ldr q10, [x24], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v22.16b, v18.16b, v16.16b\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v9.16b, v18.16b, v16.16b\n"
+ "ldr q8, [x26], #0x10\n"
+ "ldr q7, [x25], #0x10\n"
+ "ldr q6, [x24], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v5.16b, v17.16b, v16.16b\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v4.16b, v17.16b, v16.16b\n"
+ "ldr q3, [x26], #0x10\n"
+ "ldr q2, [x25], #0x10\n"
+ "ldr q1, [x24], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v0.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip2 v31.16b, v18.16b, v17.16b\n"
+ "ldr q30, [x21], #0x10\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip1 v28.16b, v19.16b, v16.16b\n"
+ "ldr q27, [x22], #0x10\n"
+ "zip2 v21.16b, v19.16b, v16.16b\n"
+ "ldr q26, [x21], #0x10\n"
+ "zip1 v16.16b, v11.16b, v30.16b\n"
+ "ldr q25, [x20], #0x10\n"
+ "zip1 v20.16b, v22.16b, v16.16b\n"
+ "ldr q24, [x22], #0x10\n"
+ "zip1 v19.16b, v10.16b, v29.16b\n"
+ "zip2 v18.16b, v22.16b, v16.16b\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip1 v17.16b, v28.16b, v19.16b\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v17.16b, v28.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v20.16b, v11.16b, v30.16b\n"
+ "zip1 v18.16b, v9.16b, v20.16b\n"
+ "zip2 v19.16b, v10.16b, v29.16b\n"
+ "zip1 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v18.16b, v9.16b, v20.16b\n"
+ "zip2 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip1 v21.16b, v7.16b, v26.16b\n"
+ "zip1 v18.16b, v5.16b, v21.16b\n"
+ "zip1 v20.16b, v8.16b, v27.16b\n"
+ "zip1 v19.16b, v6.16b, v25.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v18.16b, v5.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v21.16b, v7.16b, v26.16b\n"
+ "zip2 v20.16b, v8.16b, v27.16b\n"
+ "zip1 v18.16b, v4.16b, v21.16b\n"
+ "zip2 v19.16b, v6.16b, v25.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v18.16b, v4.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip1 v21.16b, v2.16b, v23.16b\n"
+ "zip1 v18.16b, v0.16b, v21.16b\n"
+ "zip1 v20.16b, v3.16b, v24.16b\n"
+ "zip1 v19.16b, v1.16b, v22.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v18.16b, v0.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v21.16b, v2.16b, v23.16b\n"
+ "zip1 v18.16b, v31.16b, v21.16b\n"
+ "zip2 v20.16b, v3.16b, v24.16b\n"
+ "zip2 v19.16b, v1.16b, v22.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v18.16b, v31.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr d20, [x28], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "ldr d19, [x26], #0x8\n"
+ "cmp x19, #0xc\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d16, [x23], #0x8\n"
+ "ld1 { v20.s }[2], [x28], #0x4\n"
+ "ld1 { v19.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v16.s }[2], [x23], #0x4\n"
+ "zip1 v26.16b, v20.16b, v16.16b\n"
+ "ldr d17, [x22], #0x8\n"
+ "zip2 v25.16b, v20.16b, v16.16b\n"
+ "ldr d16, [x21], #0x8\n"
+ "ldr d24, [x20], #0x8\n"
+ "ld1 { v17.s }[2], [x22], #0x4\n"
+ "zip1 v23.16b, v19.16b, v17.16b\n"
+ "ld1 { v16.s }[2], [x21], #0x4\n"
+ "zip2 v22.16b, v19.16b, v17.16b\n"
+ "ld1 { v24.s }[2], [x20], #0x4\n"
+ "zip1 v21.16b, v18.16b, v16.16b\n"
+ "zip2 v20.16b, v18.16b, v16.16b\n"
+ "zip1 v18.16b, v26.16b, v21.16b\n"
+ "zip1 v19.16b, v27.16b, v24.16b\n"
+ "zip1 v17.16b, v23.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v18.16b, v26.16b, v21.16b\n"
+ "zip2 v17.16b, v23.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip1 v18.16b, v25.16b, v20.16b\n"
+ "zip2 v16.16b, v27.16b, v24.16b\n"
+ "zip1 v17.16b, v22.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s17, [x28], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s21, [x26], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s16, [x23], #0x4\n"
+ "zip1 v19.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x21], #0x4\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v18.16b, v19.16b, v18.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b18, [x28], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b21, [x26], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b16, [x23], #0x1\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr b18, [x22], #0x1\n"
+ "ldr b16, [x21], #0x1\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v18.16b, v21.16b, v18.16b\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x1\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 8, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 8, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
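
This variant always runs its full 8-row body; the csel chain redirects any rows missing when height is not a multiple of 8 to a single zeroed scratch row, so the packing body never branches on height. A scalar model of just that mechanism, with the 12-column strip blocking left out for brevity and illustrative names:

#include <cstddef>
#include <cstdint>
#include <vector>

static void reference_interleave_1x8_rows(uint8_t *out, const uint8_t *in,
                                          size_t width, size_t stride,
                                          size_t height) {
    std::vector<uint8_t> pad_row(width, 0);        // stands in for the alloca'd pad_row
    for (size_t r0 = 0; r0 < height; r0 += 8) {
        const uint8_t *rows[8];
        for (size_t r = 0; r < 8; r++) {
            // Out-of-range rows read from the zeroed scratch row instead.
            rows[r] = (r0 + r < height) ? in + (r0 + r) * stride : pad_row.data();
        }
        for (size_t x = 0; x < width; x++) {
            for (size_t r = 0; r < 8; r++) {
                *out++ = rows[r][x];               // 8 row bytes per column, as the zips produce
            }
        }
    }
}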
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
new file mode 100644
index 0000000000..78301353fd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x2.hpp
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_12_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 2) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v9.8h, v17.8h, v16.8h\n"
+ "ldr q19, [x28], #0x10\n"
+ "cmp x19, #0x18\n"
+ "zip2 v8.8h, v17.8h, v16.8h\n"
+ "ldr q16, [x26], #0x10\n"
+ "ldr q18, [x25], #0x10\n"
+ "zip1 v7.8h, v19.8h, v16.8h\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v6.8h, v19.8h, v16.8h\n"
+ "ldr q16, [x26], #0x10\n"
+ "ldr q20, [x25], #0x10\n"
+ "zip1 v5.8h, v17.8h, v16.8h\n"
+ "ldr q22, [x25], #0x10\n"
+ "zip2 v4.8h, v17.8h, v16.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "ldr q19, [x23], #0x10\n"
+ "zip1 v3.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "zip2 v2.8h, v18.8h, v16.8h\n"
+ "ldr q21, [x23], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v1.8h, v20.8h, v17.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v0.8h, v20.8h, v17.8h\n"
+ "ldr q31, [x23], #0x10\n"
+ "zip1 v30.8h, v19.8h, v18.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip2 v29.8h, v19.8h, v18.8h\n"
+ "ldr q20, [x21], #0x10\n"
+ "ldr q19, [x20], #0x10\n"
+ "zip1 v28.8h, v22.8h, v16.8h\n"
+ "zip2 v27.8h, v22.8h, v16.8h\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v26.8h, v21.8h, v17.8h\n"
+ "zip2 v25.8h, v21.8h, v17.8h\n"
+ "ldr q18, [x21], #0x10\n"
+ "zip1 v24.8h, v20.8h, v19.8h\n"
+ "ldr q17, [x20], #0x10\n"
+ "zip2 v23.8h, v20.8h, v19.8h\n"
+ "ldr q22, [x21], #0x10\n"
+ "zip1 v21.8h, v31.8h, v16.8h\n"
+ "zip2 v20.8h, v31.8h, v16.8h\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "str q9, [x27, #0x0]\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "str q8, [x27, #0x10]\n"
+ "str q7, [x27, #0x20]\n"
+ "zip1 v17.8h, v22.8h, v16.8h\n"
+ "str q3, [x27, #0x30]\n"
+ "zip2 v16.8h, v22.8h, v16.8h\n"
+ "str q2, [x27, #0x40]\n"
+ "str q1, [x27, #0x50]\n"
+ "str q30, [x27, #0x60]\n"
+ "str q29, [x27, #0x70]\n"
+ "str q26, [x27, #0x80]\n"
+ "str q24, [x27, #0x90]\n"
+ "str q23, [x27, #0xa0]\n"
+ "str q19, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "str q6, [x27, #0x0]\n"
+ "str q5, [x27, #0x10]\n"
+ "str q4, [x27, #0x20]\n"
+ "str q0, [x27, #0x30]\n"
+ "str q28, [x27, #0x40]\n"
+ "str q27, [x27, #0x50]\n"
+ "str q25, [x27, #0x60]\n"
+ "str q21, [x27, #0x70]\n"
+ "str q20, [x27, #0x80]\n"
+ "str q18, [x27, #0x90]\n"
+ "str q17, [x27, #0xa0]\n"
+ "str q16, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v29.8h, v18.8h, v16.8h\n"
+ "ldr d17, [x28], #0x8\n"
+ "cmp x19, #0xc\n"
+ "zip2 v28.8h, v18.8h, v16.8h\n"
+ "ldr d16, [x26], #0x8\n"
+ "ldr q19, [x25], #0x10\n"
+ "zip1 v27.8h, v17.8h, v16.8h\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr q17, [x24], #0x10\n"
+ "zip1 v26.8h, v19.8h, v17.8h\n"
+ "ldr d16, [x24], #0x8\n"
+ "zip2 v25.8h, v19.8h, v17.8h\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip1 v24.8h, v18.8h, v16.8h\n"
+ "ldr q18, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v23.8h, v19.8h, v17.8h\n"
+ "zip2 v22.8h, v19.8h, v17.8h\n"
+ "ldr d21, [x23], #0x8\n"
+ "ldr d17, [x22], #0x8\n"
+ "zip1 v20.8h, v18.8h, v16.8h\n"
+ "ldr d19, [x21], #0x8\n"
+ "zip2 v18.8h, v18.8h, v16.8h\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q29, [x27, #0x0]\n"
+ "zip1 v17.8h, v21.8h, v17.8h\n"
+ "str q28, [x27, #0x10]\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str q27, [x27, #0x20]\n"
+ "str q26, [x27, #0x30]\n"
+ "str q25, [x27, #0x40]\n"
+ "str q24, [x27, #0x50]\n"
+ "str q23, [x27, #0x60]\n"
+ "str q22, [x27, #0x70]\n"
+ "str q17, [x27, #0x80]\n"
+ "str q20, [x27, #0x90]\n"
+ "str q18, [x27, #0xa0]\n"
+ "str q16, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d17, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x25], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d16, [x24], #0x8\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q20, [x27, #0x0]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q18, [x27, #0x60]\n"
+ "str q16, [x27, #0x90]\n"
+ "add x27, x27, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h17, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x25], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h16, [x24], #0x2\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x23], #0x2\n"
+ "ldr h16, [x22], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s20, [x27, #0x0]\n"
+ "str s19, [x27, #0x30]\n"
+ "str s18, [x27, #0x60]\n"
+ "str s16, [x27, #0x90]\n"
+ "add x27, x27, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0xc0\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "cmp x19, #0x18\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x26], #0x10\n"
+ "ldr q20, [x28], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "str q22, [x27, #0x0]\n"
+ "str q21, [x27, #0x10]\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q19, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q18, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q16, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr d18, [x28], #0x8\n"
+ "cmp x19, #0xc\n"
+ "zip2 v17.8h, v17.8h, v16.8h\n"
+ "ldr d16, [x26], #0x8\n"
+ "str q19, [x27, #0x0]\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "str q17, [x27, #0x10]\n"
+ "str q16, [x27, #0x20]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d17, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "cmp x19, #0x4\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s16, [x27, #0x0]\n"
+ "add x27, x27, #0x4\n"
+ "cmp x19, #0x1\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x30\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 2, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
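
The 16-bit variant pairs rows: within each 12-column strip, the two halfwords of a source column sit adjacently, rows padded to an even count, presumably because the bfloat16 kernels consume values in pairs. A scalar model, again with an illustrative name and an element stride where the assembly takes bytes:

#include <cstddef>
#include <cstdint>

static void reference_interleave_12_2x2(uint16_t *out, const uint16_t *in,
                                        size_t width, size_t stride,
                                        size_t height) {
    const size_t padded = (height + 1) / 2 * 2;    // rows padded to an even count
    for (size_t x = 0; x < width; x++) {
        for (size_t r = 0; r < padded; r++) {
            const uint16_t v = (r < height) ? in[r * stride + x] : 0;
            // strip, then row pair, then column, then row-in-pair
            out[(x / 12) * 12 * padded + (r / 2) * 24 + (x % 12) * 2 + (r % 2)] = v;
        }
    }
}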
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
new file mode 100644
index 0000000000..7e8ca6648d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4.hpp
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_12_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 4) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q23, [x26], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v22.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v21.8h, v18.8h, v16.8h\n"
+ "ldr q12, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v11.8h, v17.8h, v16.8h\n"
+ "ldr q10, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v9.8h, v18.8h, v17.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v8.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q7, [x22], #0x10\n"
+ "zip1 v17.8h, v23.8h, v16.8h\n"
+ "ldr q6, [x24], #0x10\n"
+ "zip2 v16.8h, v23.8h, v16.8h\n"
+ "ldr q5, [x23], #0x10\n"
+ "zip1 v4.8h, v22.8h, v17.8h\n"
+ "ldr q3, [x22], #0x10\n"
+ "zip2 v2.8h, v22.8h, v17.8h\n"
+ "ldr q18, [x21], #0x10\n"
+ "zip1 v1.8h, v21.8h, v16.8h\n"
+ "ldr q0, [x24], #0x10\n"
+ "zip2 v31.8h, v21.8h, v16.8h\n"
+ "ldr q30, [x23], #0x10\n"
+ "zip1 v16.8h, v12.8h, v6.8h\n"
+ "ldr q29, [x22], #0x10\n"
+ "zip1 v28.8h, v20.8h, v16.8h\n"
+ "ldr q27, [x21], #0x10\n"
+ "zip2 v26.8h, v20.8h, v16.8h\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v17.8h, v19.8h, v18.8h\n"
+ "ldr q25, [x21], #0x10\n"
+ "zip2 v19.8h, v19.8h, v18.8h\n"
+ "zip1 v18.8h, v5.8h, v27.8h\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip1 v16.8h, v7.8h, v21.8h\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip2 v20.8h, v17.8h, v16.8h\n"
+ "str q4, [x27, #0x0]\n"
+ "zip2 v16.8h, v7.8h, v21.8h\n"
+ "str q2, [x27, #0x10]\n"
+ "zip1 v17.8h, v19.8h, v16.8h\n"
+ "str q1, [x27, #0x20]\n"
+ "zip2 v21.8h, v19.8h, v16.8h\n"
+ "str q31, [x27, #0x30]\n"
+ "zip1 v16.8h, v3.8h, v24.8h\n"
+ "str q28, [x27, #0x40]\n"
+ "zip1 v19.8h, v18.8h, v16.8h\n"
+ "str q26, [x27, #0x50]\n"
+ "zip2 v18.8h, v18.8h, v16.8h\n"
+ "str q22, [x27, #0x60]\n"
+ "zip2 v16.8h, v12.8h, v6.8h\n"
+ "str q20, [x27, #0x70]\n"
+ "zip1 v20.8h, v11.8h, v16.8h\n"
+ "str q17, [x27, #0x80]\n"
+ "zip2 v17.8h, v11.8h, v16.8h\n"
+ "str q21, [x27, #0x90]\n"
+ "zip1 v16.8h, v10.8h, v0.8h\n"
+ "str q19, [x27, #0xa0]\n"
+ "zip1 v19.8h, v9.8h, v16.8h\n"
+ "str q18, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v18.8h, v9.8h, v16.8h\n"
+ "str q20, [x27, #0x0]\n"
+ "zip2 v16.8h, v10.8h, v0.8h\n"
+ "str q17, [x27, #0x10]\n"
+ "zip1 v17.8h, v8.8h, v16.8h\n"
+ "str q19, [x27, #0x20]\n"
+ "zip2 v16.8h, v8.8h, v16.8h\n"
+ "str q18, [x27, #0x30]\n"
+ "zip2 v18.8h, v5.8h, v27.8h\n"
+ "str q17, [x27, #0x40]\n"
+ "zip2 v17.8h, v3.8h, v24.8h\n"
+ "str q16, [x27, #0x50]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x60]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x70]\n"
+ "zip1 v18.8h, v30.8h, v25.8h\n"
+ "zip1 v17.8h, v29.8h, v23.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x80]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x90]\n"
+ "zip2 v18.8h, v30.8h, v25.8h\n"
+ "zip2 v17.8h, v29.8h, v23.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xa0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "ldr q20, [x26], #0x10\n"
+ "cmp x19, #0xc\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.8h, v18.8h, v16.8h\n"
+ "ldr d17, [x28], #0x8\n"
+ "zip2 v23.8h, v18.8h, v16.8h\n"
+ "ldr d22, [x26], #0x8\n"
+ "ldr d16, [x25], #0x8\n"
+ "zip1 v21.8h, v17.8h, v16.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "ldr q31, [x23], #0x10\n"
+ "zip1 v18.8h, v20.8h, v16.8h\n"
+ "ldr d17, [x24], #0x8\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "ldr d30, [x23], #0x8\n"
+ "zip1 v29.8h, v19.8h, v18.8h\n"
+ "ldr q28, [x22], #0x10\n"
+ "zip2 v20.8h, v19.8h, v18.8h\n"
+ "ldr q27, [x21], #0x10\n"
+ "zip1 v19.8h, v23.8h, v16.8h\n"
+ "ldr q26, [x20], #0x10\n"
+ "zip2 v18.8h, v23.8h, v16.8h\n"
+ "ldr d25, [x22], #0x8\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
+ "zip1 v24.8h, v21.8h, v16.8h\n"
+ "ldr d23, [x21], #0x8\n"
+ "zip2 v22.8h, v21.8h, v16.8h\n"
+ "ldr d21, [x20], #0x8\n"
+ "zip1 v17.8h, v31.8h, v27.8h\n"
+ "str q29, [x27, #0x0]\n"
+ "zip1 v16.8h, v28.8h, v26.8h\n"
+ "str q20, [x27, #0x10]\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "str q19, [x27, #0x20]\n"
+ "zip2 v19.8h, v17.8h, v16.8h\n"
+ "str q18, [x27, #0x30]\n"
+ "zip2 v18.8h, v31.8h, v27.8h\n"
+ "str q24, [x27, #0x40]\n"
+ "zip2 v16.8h, v28.8h, v26.8h\n"
+ "str q22, [x27, #0x50]\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q20, [x27, #0x60]\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q19, [x27, #0x70]\n"
+ "zip1 v18.8h, v30.8h, v23.8h\n"
+ "str q17, [x27, #0x80]\n"
+ "zip1 v17.8h, v25.8h, v21.8h\n"
+ "str q16, [x27, #0x90]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xa0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d17, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d18, [x26], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d16, [x25], #0x8\n"
+ "zip1 v17.8h, v17.8h, v16.8h\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "zip2 v17.8h, v17.8h, v16.8h\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q18, [x27, #0x0]\n"
+ "zip1 v18.8h, v21.8h, v19.8h\n"
+ "str q17, [x27, #0x10]\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x60]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x70]\n"
+ "add x27, x27, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h18, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h17, [x26], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h16, [x25], #0x2\n"
+ "zip1 v18.8h, v18.8h, v16.8h\n"
+ "ldr h16, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "ldr h19, [x22], #0x2\n"
+ "ldr h17, [x21], #0x2\n"
+ "zip1 v18.8h, v18.8h, v16.8h\n"
+ "ldr h16, [x20], #0x2\n"
+ "zip1 v17.8h, v20.8h, v17.8h\n"
+ "str d18, [x27, #0x0]\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x27, #0x60]\n"
+ "add x27, x27, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0xc0\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q19, [x26], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v28.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v27.8h, v18.8h, v16.8h\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v25.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v24.8h, v17.8h, v16.8h\n"
+ "ldr q23, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v22.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "zip2 v21.8h, v18.8h, v16.8h\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip1 v16.8h, v19.8h, v17.8h\n"
+ "zip2 v18.8h, v19.8h, v17.8h\n"
+ "ldr q19, [x24], #0x10\n"
+ "zip1 v17.8h, v28.8h, v16.8h\n"
+ "zip2 v16.8h, v28.8h, v16.8h\n"
+ "str q17, [x27, #0x0]\n"
+ "zip1 v17.8h, v27.8h, v18.8h\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v16.8h, v27.8h, v18.8h\n"
+ "str q17, [x27, #0x20]\n"
+ "zip1 v17.8h, v26.8h, v20.8h\n"
+ "str q16, [x27, #0x30]\n"
+ "zip1 v16.8h, v25.8h, v17.8h\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.8h, v25.8h, v17.8h\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v18.8h, v26.8h, v20.8h\n"
+ "zip1 v17.8h, v23.8h, v19.8h\n"
+ "zip1 v16.8h, v24.8h, v18.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.8h, v24.8h, v18.8h\n"
+ "str q16, [x27, #0x10]\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.8h, v22.8h, v17.8h\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v17.8h, v23.8h, v19.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.8h, v21.8h, v17.8h\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "ldr q24, [x26], #0x10\n"
+ "cmp x19, #0xc\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v23.8h, v18.8h, v16.8h\n"
+ "ldr d17, [x28], #0x8\n"
+ "zip2 v22.8h, v18.8h, v16.8h\n"
+ "ldr d21, [x26], #0x8\n"
+ "ldr d16, [x25], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip1 v19.8h, v24.8h, v16.8h\n"
+ "ldr d18, [x24], #0x8\n"
+ "zip2 v17.8h, v24.8h, v16.8h\n"
+ "zip1 v16.8h, v23.8h, v19.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.8h, v23.8h, v19.8h\n"
+ "str q16, [x27, #0x10]\n"
+ "zip1 v16.8h, v22.8h, v17.8h\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.8h, v22.8h, v17.8h\n"
+ "str q16, [x27, #0x30]\n"
+ "zip1 v17.8h, v21.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.8h, v20.8h, v17.8h\n"
+ "str q16, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d18, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d17, [x26], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d16, [x25], #0x8\n"
+ "zip1 v18.8h, v18.8h, v16.8h\n"
+ "ldr d16, [x24], #0x8\n"
+ "zip1 v17.8h, v17.8h, v16.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h18, [x26], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h16, [x25], #0x2\n"
+ "zip1 v17.8h, v17.8h, v16.8h\n"
+ "ldr h16, [x24], #0x2\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
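+// Transposed, 4-blocked specialisation for bfloat16 data, implemented on uint16_t.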
+template<>
+void Transform<12, 4, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_2x4(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..efb1c742ed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_2x4_fp32bf16.hpp
@@ -0,0 +1,735 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
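+// Transpose-interleave 12 output columns at a time, converting fp32 input to
+// bfloat16 as it goes: zip1/zip2 pair values from four rows, and bfcvtn/bfcvtn2
+// narrow each pair to bf16. The main loop consumes 8 input rows per pass; the
+// tail loop handles up to 4, reading from pad_row where real rows run out.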
+void a64_transpose_interleave_12_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
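+    // Zeroed stack buffer substituted (via csel in the tail loop) for input
+    // rows past 'height', so the 4-row interleave never reads off the end.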
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 4) * sizeof(bfloat16);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q12, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q20, [x26], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q11, [x25], #0x10\n"
+ "zip1 v29.4s, v12.4s, v11.4s\n"
+ "ldr q5, [x28], #0x10\n"
+ "zip2 v0.4s, v12.4s, v11.4s\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v23.4s, v5.4s, v17.4s\n"
+ "ldr q25, [x28], #0x10\n"
+ "zip2 v18.4s, v5.4s, v17.4s\n"
+ "ldr q6, [x26], #0x10\n"
+ "ldr q31, [x25], #0x10\n"
+ "zip1 v21.4s, v25.4s, v31.4s\n"
+ "ldr q16, [x28], #0x10\n"
+ "zip2 v10.4s, v25.4s, v31.4s\n"
+ "ldr q11, [x26], #0x10\n"
+ "ldr q1, [x25], #0x10\n"
+ "zip1 v13.4s, v16.4s, v1.4s\n"
+ "ldr q14, [x28], #0x10\n"
+ "zip2 v24.4s, v16.4s, v1.4s\n"
+ "ldr q4, [x26], #0x10\n"
+ "ldr q22, [x25], #0x10\n"
+ "zip1 v1.4s, v14.4s, v22.4s\n"
+ "ldr q15, [x28], #0x10\n"
+ "zip2 v8.4s, v14.4s, v22.4s\n"
+ "ldr q31, [x26], #0x10\n"
+ "ldr q3, [x25], #0x10\n"
+ "zip1 v27.4s, v15.4s, v3.4s\n"
+ "ldr q30, [x24], #0x10\n"
+ "zip2 v22.4s, v15.4s, v3.4s\n"
+ "ldr q15, [x23], #0x10\n"
+ "ldr q5, [x22], #0x10\n"
+ "zip1 v16.4s, v20.4s, v30.4s\n"
+ "ldr q3, [x24], #0x10\n"
+ "zip2 v7.4s, v20.4s, v30.4s\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v12.4s, v29.4s, v16.4s\n"
+ "ldr q25, [x22], #0x10\n"
+ ".inst 0x0ea16994 // bfcvtn v20.4h, v12.4s\n"
+ "ldr q2, [x21], #0x10\n"
+ "zip2 v16.4s, v29.4s, v16.4s\n"
+ "ldr q19, [x24], #0x10\n"
+ "zip1 v12.4s, v0.4s, v7.4s\n"
+ "ldr q9, [x23], #0x10\n"
+ ".inst 0x4ea16a14 // bfcvtn2 v20.8h, v16.4s\n"
+ "ldr q14, [x22], #0x10\n"
+ ".inst 0x0ea1699e // bfcvtn v30.4h, v12.4s\n"
+ "ldr q12, [x21], #0x10\n"
+ "zip2 v16.4s, v0.4s, v7.4s\n"
+ "ldr q7, [x24], #0x10\n"
+ "zip1 v29.4s, v28.4s, v3.4s\n"
+ "ldr q0, [x23], #0x10\n"
+ ".inst 0x4ea16a1e // bfcvtn2 v30.8h, v16.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip1 v16.4s, v23.4s, v29.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v23.4s, v23.4s, v29.4s\n"
+ "ldr q29, [x24], #0x10\n"
+ "zip2 v28.4s, v28.4s, v3.4s\n"
+ "ldr q3, [x23], #0x10\n"
+ ".inst 0x4ea16af0 // bfcvtn2 v16.8h, v23.4s\n"
+ "zip1 v23.4s, v18.4s, v28.4s\n"
+ ".inst 0x0ea16af7 // bfcvtn v23.4h, v23.4s\n"
+ "zip2 v28.4s, v18.4s, v28.4s\n"
+ "ldr q18, [x24], #0x10\n"
+ ".inst 0x4ea16b97 // bfcvtn2 v23.8h, v28.4s\n"
+ "zip1 v28.4s, v6.4s, v19.4s\n"
+ "zip2 v6.4s, v6.4s, v19.4s\n"
+ "zip1 v19.4s, v21.4s, v28.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "zip2 v28.4s, v21.4s, v28.4s\n"
+ "ldr q21, [x23], #0x10\n"
+ ".inst 0x4ea16b93 // bfcvtn2 v19.8h, v28.4s\n"
+ "zip1 v28.4s, v10.4s, v6.4s\n"
+ ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n"
+ "zip2 v6.4s, v10.4s, v6.4s\n"
+ "ldr q10, [x22], #0x10\n"
+ ".inst 0x4ea168dc // bfcvtn2 v28.8h, v6.4s\n"
+ "zip1 v6.4s, v11.4s, v7.4s\n"
+ "zip2 v7.4s, v11.4s, v7.4s\n"
+ "zip1 v11.4s, v13.4s, v6.4s\n"
+ ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
+ "zip2 v13.4s, v13.4s, v6.4s\n"
+ "ldr q6, [x22], #0x10\n"
+ ".inst 0x4ea169ab // bfcvtn2 v11.8h, v13.4s\n"
+ "zip1 v13.4s, v24.4s, v7.4s\n"
+ ".inst 0x0ea169ad // bfcvtn v13.4h, v13.4s\n"
+ "zip2 v7.4s, v24.4s, v7.4s\n"
+ "ldr q24, [x21], #0x10\n"
+ ".inst 0x4ea168ed // bfcvtn2 v13.8h, v7.4s\n"
+ "zip1 v7.4s, v4.4s, v29.4s\n"
+ "zip2 v29.4s, v4.4s, v29.4s\n"
+ "zip1 v4.4s, v1.4s, v7.4s\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "zip2 v7.4s, v1.4s, v7.4s\n"
+ "ldr q1, [x21], #0x10\n"
+ ".inst 0x4ea168e4 // bfcvtn2 v4.8h, v7.4s\n"
+ "zip1 v7.4s, v8.4s, v29.4s\n"
+ ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
+ "zip2 v8.4s, v8.4s, v29.4s\n"
+ "ldr q29, [x21], #0x10\n"
+ ".inst 0x4ea16907 // bfcvtn2 v7.8h, v8.4s\n"
+ "zip1 v8.4s, v31.4s, v18.4s\n"
+ "zip2 v31.4s, v31.4s, v18.4s\n"
+ "zip1 v18.4s, v27.4s, v8.4s\n"
+ ".inst 0x0ea16a52 // bfcvtn v18.4h, v18.4s\n"
+ "zip2 v27.4s, v27.4s, v8.4s\n"
+ "ldr q8, [x21], #0x10\n"
+ ".inst 0x4ea16b72 // bfcvtn2 v18.8h, v27.4s\n"
+ "zip1 v27.4s, v22.4s, v31.4s\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ "zip2 v31.4s, v22.4s, v31.4s\n"
+ "ldr q22, [x20], #0x10\n"
+ ".inst 0x4ea16bfb // bfcvtn2 v27.8h, v31.4s\n"
+ "zip1 v31.4s, v15.4s, v2.4s\n"
+ "zip2 v2.4s, v15.4s, v2.4s\n"
+ "zip1 v15.4s, v26.4s, v12.4s\n"
+ "zip2 v26.4s, v26.4s, v12.4s\n"
+ "zip1 v12.4s, v5.4s, v22.4s\n"
+ "zip2 v22.4s, v5.4s, v22.4s\n"
+ "zip1 v5.4s, v31.4s, v12.4s\n"
+ ".inst 0x0ea168a5 // bfcvtn v5.4h, v5.4s\n"
+ "zip2 v31.4s, v31.4s, v12.4s\n"
+ "ldr q12, [x20], #0x10\n"
+ ".inst 0x4ea16be5 // bfcvtn2 v5.8h, v31.4s\n"
+ "zip1 v31.4s, v2.4s, v22.4s\n"
+ ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
+ "zip2 v2.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x20], #0x10\n"
+ ".inst 0x4ea1685f // bfcvtn2 v31.8h, v2.4s\n"
+ "zip1 v2.4s, v25.4s, v12.4s\n"
+ "zip2 v25.4s, v25.4s, v12.4s\n"
+ "zip1 v12.4s, v15.4s, v2.4s\n"
+ ".inst 0x0ea1698c // bfcvtn v12.4h, v12.4s\n"
+ "zip2 v15.4s, v15.4s, v2.4s\n"
+ "ldr q2, [x20], #0x10\n"
+ ".inst 0x4ea169ec // bfcvtn2 v12.8h, v15.4s\n"
+ "zip1 v15.4s, v26.4s, v25.4s\n"
+ ".inst 0x0ea169ef // bfcvtn v15.4h, v15.4s\n"
+ "zip2 v25.4s, v26.4s, v25.4s\n"
+ "ldr q26, [x20], #0x10\n"
+ ".inst 0x4ea16b2f // bfcvtn2 v15.8h, v25.4s\n"
+ "ldr q25, [x20], #0x10\n"
+ "str q20, [x27, #0x0]\n"
+ "zip1 v20.4s, v9.4s, v24.4s\n"
+ "zip2 v24.4s, v9.4s, v24.4s\n"
+ "str q30, [x27, #0x10]\n"
+ "zip1 v9.4s, v14.4s, v22.4s\n"
+ "str q16, [x27, #0x20]\n"
+ "zip1 v16.4s, v20.4s, v9.4s\n"
+ "str q23, [x27, #0x30]\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str q19, [x27, #0x40]\n"
+ "zip2 v9.4s, v20.4s, v9.4s\n"
+ "str q28, [x27, #0x50]\n"
+ "zip2 v22.4s, v14.4s, v22.4s\n"
+ "str q5, [x27, #0x60]\n"
+ ".inst 0x4ea16930 // bfcvtn2 v16.8h, v9.4s\n"
+ "str q31, [x27, #0x70]\n"
+ "zip1 v19.4s, v24.4s, v22.4s\n"
+ "str q12, [x27, #0x80]\n"
+ ".inst 0x0ea16a6c // bfcvtn v12.4h, v19.4s\n"
+ "str q15, [x27, #0x90]\n"
+ "zip2 v9.4s, v24.4s, v22.4s\n"
+ "str q16, [x27, #0xa0]\n"
+ "zip1 v15.4s, v0.4s, v1.4s\n"
+ ".inst 0x4ea1692c // bfcvtn2 v12.8h, v9.4s\n"
+ "str q12, [x27, #0xb0]\n"
+ "zip1 v20.4s, v17.4s, v2.4s\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip1 v16.4s, v15.4s, v20.4s\n"
+ "str q11, [x27, #0x0]\n"
+ "zip2 v9.4s, v15.4s, v20.4s\n"
+ "str q13, [x27, #0x10]\n"
+ ".inst 0x0ea16a0f // bfcvtn v15.4h, v16.4s\n"
+ "str q4, [x27, #0x20]\n"
+ "zip2 v14.4s, v0.4s, v1.4s\n"
+ "str q7, [x27, #0x30]\n"
+ "zip2 v31.4s, v17.4s, v2.4s\n"
+ "str q18, [x27, #0x40]\n"
+ ".inst 0x4ea1692f // bfcvtn2 v15.8h, v9.4s\n"
+ "str q27, [x27, #0x50]\n"
+ "zip1 v22.4s, v14.4s, v31.4s\n"
+ "str q15, [x27, #0x60]\n"
+ ".inst 0x0ea16ac9 // bfcvtn v9.4h, v22.4s\n"
+ "zip2 v11.4s, v14.4s, v31.4s\n"
+ "zip1 v18.4s, v3.4s, v29.4s\n"
+ "zip1 v27.4s, v10.4s, v26.4s\n"
+ ".inst 0x4ea16969 // bfcvtn2 v9.8h, v11.4s\n"
+ "str q9, [x27, #0x70]\n"
+ "zip1 v13.4s, v18.4s, v27.4s\n"
+ "zip2 v9.4s, v18.4s, v27.4s\n"
+ ".inst 0x0ea169b3 // bfcvtn v19.4h, v13.4s\n"
+ "zip2 v18.4s, v3.4s, v29.4s\n"
+ "zip2 v1.4s, v10.4s, v26.4s\n"
+ ".inst 0x4ea16933 // bfcvtn2 v19.8h, v9.4s\n"
+ "str q19, [x27, #0x80]\n"
+ "zip1 v16.4s, v18.4s, v1.4s\n"
+ "zip2 v20.4s, v18.4s, v1.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v21.4s, v8.4s\n"
+ "zip1 v2.4s, v6.4s, v25.4s\n"
+ ".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n"
+ "str q16, [x27, #0x90]\n"
+ "zip1 v16.4s, v18.4s, v2.4s\n"
+ "zip2 v20.4s, v18.4s, v2.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v21.4s, v8.4s\n"
+ "zip2 v17.4s, v6.4s, v25.4s\n"
+ ".inst 0x4ea16a90 // bfcvtn2 v16.8h, v20.4s\n"
+ "str q16, [x27, #0xa0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q16, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "ldr q21, [x26], #0x10\n"
+ "cmp x19, #0xc\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v20.4s, v18.4s, v16.4s\n"
+ "ldr q8, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v7.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v6.4s, v17.4s, v16.4s\n"
+ "ldr q5, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v4.4s, v18.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v3.4s, v18.4s, v17.4s\n"
+ "ldr q2, [x23], #0x10\n"
+ "ldr q1, [x22], #0x10\n"
+ "zip1 v17.4s, v21.4s, v16.4s\n"
+ "ldr q0, [x24], #0x10\n"
+ "zip2 v18.4s, v21.4s, v16.4s\n"
+ "ldr q31, [x23], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ "ldr q30, [x22], #0x10\n"
+ ".inst 0x0ea16a1d // bfcvtn v29.4h, v16.4s\n"
+ "ldr q28, [x21], #0x10\n"
+ "zip2 v17.4s, v19.4s, v17.4s\n"
+ "ldr q27, [x24], #0x10\n"
+ "zip1 v16.4s, v20.4s, v18.4s\n"
+ "ldr q26, [x23], #0x10\n"
+ ".inst 0x4ea16a3d // bfcvtn2 v29.8h, v17.4s\n"
+ "ldr q25, [x22], #0x10\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "ldr q24, [x21], #0x10\n"
+ "zip2 v16.4s, v20.4s, v18.4s\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v17.4s, v8.4s, v0.4s\n"
+ "ldr q22, [x21], #0x10\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "zip1 v16.4s, v7.4s, v17.4s\n"
+ "ldr q21, [x20], #0x10\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr q20, [x20], #0x10\n"
+ "zip2 v16.4s, v7.4s, v17.4s\n"
+ "zip2 v17.4s, v8.4s, v0.4s\n"
+ "str q29, [x27, #0x0]\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "str q19, [x27, #0x10]\n"
+ "zip1 v16.4s, v6.4s, v17.4s\n"
+ "str q18, [x27, #0x20]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v6.4s, v17.4s\n"
+ "zip1 v17.4s, v5.4s, v27.4s\n"
+ "zip1 v16.4s, v4.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x30]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v4.4s, v17.4s\n"
+ "zip2 v17.4s, v5.4s, v27.4s\n"
+ "zip1 v16.4s, v3.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x40]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v16.4s, v3.4s, v17.4s\n"
+ "zip1 v18.4s, v2.4s, v28.4s\n"
+ "zip1 v17.4s, v1.4s, v23.4s\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "str q19, [x27, #0x50]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v2.4s, v28.4s\n"
+ "zip2 v17.4s, v1.4s, v23.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x60]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v31.4s, v24.4s\n"
+ "zip1 v17.4s, v30.4s, v21.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x70]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v31.4s, v24.4s\n"
+ "zip2 v17.4s, v30.4s, v21.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x80]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v26.4s, v22.4s\n"
+ "zip1 v17.4s, v25.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x90]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v26.4s, v22.4s\n"
+ "zip2 v17.4s, v25.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xa0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q16, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr q20, [x28], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x26], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v19.4s, v20.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v25.4s, v20.4s, v17.4s\n"
+ "ldr q24, [x23], #0x10\n"
+ "ldr q23, [x22], #0x10\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "ldr q22, [x21], #0x10\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q20, [x20], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v25.4s, v21.4s\n"
+ ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n"
+ "str q18, [x27, #0x0]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v21.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v20.4s\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "str q19, [x27, #0x10]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x60]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q16, [x27, #0x70]\n"
+ "add x27, x27, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr s18, [x28], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s17, [x26], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v18.4s, v18.4s, v16.4s\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "ldr s18, [x20], #0x4\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "str d16, [x27, #0x0]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x27, #0x60]\n"
+ "add x27, x27, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0xc0\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q20, [x26], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v9.4s, v17.4s, v16.4s\n"
+ "ldr q8, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v7.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v6.4s, v18.4s, v16.4s\n"
+ "ldr q5, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v4.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v3.4s, v17.4s, v16.4s\n"
+ "ldr q2, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v1.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v0.4s, v18.4s, v16.4s\n"
+ "ldr q31, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v30.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v29.4s, v17.4s, v16.4s\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v27.4s, v18.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v26.4s, v18.4s, v17.4s\n"
+ "ldr q25, [x24], #0x10\n"
+ "zip1 v17.4s, v20.4s, v16.4s\n"
+ "zip2 v24.4s, v20.4s, v16.4s\n"
+ "ldr q23, [x24], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ "zip2 v17.4s, v19.4s, v17.4s\n"
+ "ldr q22, [x24], #0x10\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip1 v16.4s, v9.4s, v24.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip2 v16.4s, v9.4s, v24.4s\n"
+ "zip1 v17.4s, v8.4s, v25.4s\n"
+ "str q19, [x27, #0x0]\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "str q18, [x27, #0x10]\n"
+ "zip1 v16.4s, v7.4s, v17.4s\n"
+ "zip2 v19.4s, v7.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v8.4s, v25.4s\n"
+ "zip1 v16.4s, v6.4s, v17.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "str q18, [x27, #0x20]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v6.4s, v17.4s\n"
+ "zip1 v17.4s, v5.4s, v23.4s\n"
+ "zip1 v16.4s, v4.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x30]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v4.4s, v17.4s\n"
+ "zip2 v17.4s, v5.4s, v23.4s\n"
+ "zip1 v16.4s, v3.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x40]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v3.4s, v17.4s\n"
+ "zip1 v17.4s, v2.4s, v22.4s\n"
+ "zip1 v16.4s, v1.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x50]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v18.4s, v1.4s, v17.4s\n"
+ "zip2 v17.4s, v2.4s, v22.4s\n"
+ "zip1 v16.4s, v0.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x0]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v0.4s, v17.4s\n"
+ "zip1 v17.4s, v31.4s, v21.4s\n"
+ "zip1 v16.4s, v30.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x10]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v30.4s, v17.4s\n"
+ "zip2 v17.4s, v31.4s, v21.4s\n"
+ "zip1 v16.4s, v29.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x20]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v29.4s, v17.4s\n"
+ "zip1 v17.4s, v28.4s, v20.4s\n"
+ "zip1 v16.4s, v27.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x30]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v17.4s, v27.4s, v17.4s\n"
+ "zip2 v18.4s, v28.4s, v20.4s\n"
+ "zip1 v16.4s, v26.4s, v18.4s\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ "str q19, [x27, #0x40]\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v26.4s, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q17, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "ldr q20, [x26], #0x10\n"
+ "cmp x19, #0xc\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v29.4s, v18.4s, v16.4s\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v27.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v26.4s, v17.4s, v16.4s\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v24.4s, v18.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v23.4s, v18.4s, v17.4s\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip1 v17.4s, v20.4s, v16.4s\n"
+ "zip2 v21.4s, v20.4s, v16.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ "zip2 v19.4s, v19.4s, v17.4s\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip1 v16.4s, v29.4s, v21.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ ".inst 0x4ea16a71 // bfcvtn2 v17.8h, v19.4s\n"
+ "str q17, [x27, #0x0]\n"
+ "zip2 v16.4s, v29.4s, v21.4s\n"
+ "zip1 v17.4s, v28.4s, v22.4s\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "str q18, [x27, #0x10]\n"
+ "zip1 v16.4s, v27.4s, v17.4s\n"
+ "zip2 v19.4s, v27.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v28.4s, v22.4s\n"
+ "zip1 v16.4s, v26.4s, v17.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "str q18, [x27, #0x20]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v26.4s, v17.4s\n"
+ "zip1 v17.4s, v25.4s, v20.4s\n"
+ "zip1 v16.4s, v24.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x30]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v17.4s, v24.4s, v17.4s\n"
+ "zip2 v18.4s, v25.4s, v20.4s\n"
+ "zip1 v16.4s, v23.4s, v18.4s\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ "str q19, [x27, #0x40]\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v23.4s, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q17, [x27, #0x50]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q19, [x28], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x26], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v21.4s, v19.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v20.4s, v19.4s, v17.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip2 v19.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v21.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v21.4s, v17.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n"
+ "str q18, [x27, #0x0]\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q17, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr s17, [x28], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s18, [x26], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v17.4s, v17.4s, v16.4s\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
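+
+// Transposed, 4-blocked specialisation converting float input to bfloat16.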
+template<>
+void Transform<12, 4, true, VLType::None>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
new file mode 100644
index 0000000000..7359eea737
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_s8s16.hpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
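+// Transpose-interleave 12 columns at a time while widening int8_t inputs to
+// int16_t with sshll/sshll2. The main loop processes 4 input rows per pass;
+// the tail loop processes one row at a time.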
+void a64_transpose_interleave_12_s8s16(int16_t *out, const int8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(int16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q16, [x24], #0x10\n"
+ "sshll v27.8h, v16.8b, #0x0\n"
+ "ldr d17, [x24], #0x8\n"
+ "sub x19, x19, #0x18\n"
+ "sshll2 v16.8h, v16.16b, #0x0\n"
+ "ldr q26, [x22], #0x10\n"
+ "cmp x19, #0x18\n"
+ "dup v20.2d, v16.d[0]\n"
+ "ldr q25, [x21], #0x10\n"
+ "dup v24.2d, v16.d[1]\n"
+ "ldr q23, [x20], #0x10\n"
+ "sshll v16.8h, v17.8b, #0x0\n"
+ "ldr d19, [x22], #0x8\n"
+ "mov v24.d[1], v16.d[0]\n"
+ "dup v22.2d, v16.d[1]\n"
+ "ldr d18, [x21], #0x8\n"
+ "sshll v16.8h, v26.8b, #0x0\n"
+ "ldr d21, [x20], #0x8\n"
+ "mov v20.d[1], v16.d[0]\n"
+ "str q27, [x23, #0x0]\n"
+ "dup v17.2d, v16.d[1]\n"
+ "str q20, [x23, #0x10]\n"
+ "sshll2 v16.8h, v26.16b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x20]\n"
+ "mov v22.d[1], v16.d[1]\n"
+ "sshll v20.8h, v19.8b, #0x0\n"
+ "sshll v16.8h, v25.8b, #0x0\n"
+ "str q16, [x23, #0x30]\n"
+ "sshll2 v16.8h, v25.16b, #0x0\n"
+ "dup v17.2d, v16.d[0]\n"
+ "dup v19.2d, v16.d[1]\n"
+ "sshll v16.8h, v18.8b, #0x0\n"
+ "mov v19.d[1], v16.d[0]\n"
+ "dup v18.2d, v16.d[1]\n"
+ "sshll v16.8h, v23.8b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x40]\n"
+ "dup v17.2d, v16.d[1]\n"
+ "sshll2 v16.8h, v23.16b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "mov v18.d[1], v16.d[1]\n"
+ "str q24, [x23, #0x0]\n"
+ "sshll v16.8h, v21.8b, #0x0\n"
+ "str q22, [x23, #0x10]\n"
+ "str q20, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q16, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "ldr d21, [x22], #0x8\n"
+ "cmp x19, #0xc\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d19, [x20], #0x8\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "sshll v17.8h, v16.8b, #0x0\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "sshll2 v18.8h, v16.16b, #0x0\n"
+ "ld1 { v20.s }[2], [x21], #0x4\n"
+ "ld1 { v19.s }[2], [x20], #0x4\n"
+ "sshll v16.8h, v21.8b, #0x0\n"
+ "str q17, [x23, #0x0]\n"
+ "sshll2 v17.8h, v21.16b, #0x0\n"
+ "mov v18.d[1], v16.d[0]\n"
+ "str q18, [x23, #0x10]\n"
+ "dup v16.2d, v16.d[1]\n"
+ "mov v16.d[1], v17.d[0]\n"
+ "str q16, [x23, #0x20]\n"
+ "sshll v16.8h, v20.8b, #0x0\n"
+ "str q16, [x23, #0x30]\n"
+ "sshll2 v17.8h, v20.16b, #0x0\n"
+ "sshll v16.8h, v19.8b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x40]\n"
+ "dup v17.2d, v16.d[1]\n"
+ "sshll2 v16.8h, v19.16b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s16, [x24], #0x4\n"
+ "sshll v19.8h, v16.8b, #0x0\n"
+ "ldr s16, [x22], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "sshll v18.8h, v16.8b, #0x0\n"
+ "ldr s16, [x21], #0x4\n"
+ "cmp x19, #0x4\n"
+ "sshll v17.8h, v16.8b, #0x0\n"
+ "ldr s16, [x20], #0x4\n"
+ "str d19, [x23, #0x0]\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str d18, [x23, #0x18]\n"
+ "str d17, [x23, #0x30]\n"
+ "str d16, [x23, #0x48]\n"
+ "add x23, x23, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b16, [x24], #0x1\n"
+ "sshll v19.8h, v16.8b, #0x0\n"
+ "ldr b16, [x22], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "sshll v18.8h, v16.8b, #0x0\n"
+ "ldr b16, [x21], #0x1\n"
+ "cmp x19, #0x1\n"
+ "sshll v17.8h, v16.8b, #0x0\n"
+ "ldr b16, [x20], #0x1\n"
+ "str h19, [x23, #0x0]\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str h18, [x23, #0x18]\n"
+ "str h17, [x23, #0x30]\n"
+ "str h16, [x23, #0x48]\n"
+ "add x23, x23, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q17, [x24], #0x10\n"
+ "sshll v16.8h, v17.8b, #0x0\n"
+ "ldr d18, [x24], #0x8\n"
+ "sub x19, x19, #0x18\n"
+ "sshll2 v17.8h, v17.16b, #0x0\n"
+ "str q16, [x23, #0x0]\n"
+ "cmp x19, #0x18\n"
+ "dup v16.2d, v17.d[0]\n"
+ "str d16, [x23, #0x10]\n"
+ "dup v17.2d, v17.d[1]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "sshll v16.8h, v18.8b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x0]\n"
+ "dup v16.2d, v16.d[1]\n"
+ "str d16, [x23, #0x10]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr d17, [x24], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "cmp x19, #0xc\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "sshll v16.8h, v17.8b, #0x0\n"
+ "str q16, [x23, #0x0]\n"
+ "sshll2 v16.8h, v17.16b, #0x0\n"
+ "str d16, [x23, #0x10]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr s16, [x24], #0x4\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str d16, [x23, #0x0]\n"
+ "sub x19, x19, #0x4\n"
+ "add x23, x23, #0x8\n"
+ "cmp x19, #0x4\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b16, [x24], #0x1\n"
+ "sshll v16.8h, v16.8b, #0x0\n"
+ "str h16, [x23, #0x0]\n"
+ "sub x19, x19, #0x1\n"
+ "add x23, x23, #0x2\n"
+ "cmp x19, #0x1\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x18\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
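+
+// Transposed, unblocked specialisation widening int8_t input to int16_t.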
+template<>
+void Transform<12, 1, true, VLType::None>(
+ int16_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_s8s16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
new file mode 100644
index 0000000000..34fb0ed5ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12_u8u16.hpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
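+// Transpose-interleave 12 columns at a time while widening uint8_t inputs to
+// uint16_t with ushll/ushll2. The main loop processes 4 input rows per pass;
+// the tail loop processes one row at a time.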
+void a64_transpose_interleave_12_u8u16(uint16_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q16, [x24], #0x10\n"
+ "ushll v27.8h, v16.8b, #0x0\n"
+ "ldr d17, [x24], #0x8\n"
+ "sub x19, x19, #0x18\n"
+ "ushll2 v16.8h, v16.16b, #0x0\n"
+ "ldr q26, [x22], #0x10\n"
+ "cmp x19, #0x18\n"
+ "dup v20.2d, v16.d[0]\n"
+ "ldr q25, [x21], #0x10\n"
+ "dup v24.2d, v16.d[1]\n"
+ "ldr q23, [x20], #0x10\n"
+ "ushll v16.8h, v17.8b, #0x0\n"
+ "ldr d19, [x22], #0x8\n"
+ "mov v24.d[1], v16.d[0]\n"
+ "dup v22.2d, v16.d[1]\n"
+ "ldr d18, [x21], #0x8\n"
+ "ushll v16.8h, v26.8b, #0x0\n"
+ "ldr d21, [x20], #0x8\n"
+ "mov v20.d[1], v16.d[0]\n"
+ "str q27, [x23, #0x0]\n"
+ "dup v17.2d, v16.d[1]\n"
+ "str q20, [x23, #0x10]\n"
+ "ushll2 v16.8h, v26.16b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x20]\n"
+ "mov v22.d[1], v16.d[1]\n"
+ "ushll v20.8h, v19.8b, #0x0\n"
+ "ushll v16.8h, v25.8b, #0x0\n"
+ "str q16, [x23, #0x30]\n"
+ "ushll2 v16.8h, v25.16b, #0x0\n"
+ "dup v17.2d, v16.d[0]\n"
+ "dup v19.2d, v16.d[1]\n"
+ "ushll v16.8h, v18.8b, #0x0\n"
+ "mov v19.d[1], v16.d[0]\n"
+ "dup v18.2d, v16.d[1]\n"
+ "ushll v16.8h, v23.8b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x40]\n"
+ "dup v17.2d, v16.d[1]\n"
+ "ushll2 v16.8h, v23.16b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "mov v18.d[1], v16.d[1]\n"
+ "str q24, [x23, #0x0]\n"
+ "ushll v16.8h, v21.8b, #0x0\n"
+ "str q22, [x23, #0x10]\n"
+ "str q20, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q16, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "ldr d21, [x22], #0x8\n"
+ "cmp x19, #0xc\n"
+ "ldr d20, [x21], #0x8\n"
+ "ldr d19, [x20], #0x8\n"
+ "ld1 { v16.s }[2], [x24], #0x4\n"
+ "ushll v17.8h, v16.8b, #0x0\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ushll2 v18.8h, v16.16b, #0x0\n"
+ "ld1 { v20.s }[2], [x21], #0x4\n"
+ "ld1 { v19.s }[2], [x20], #0x4\n"
+ "ushll v16.8h, v21.8b, #0x0\n"
+ "str q17, [x23, #0x0]\n"
+ "ushll2 v17.8h, v21.16b, #0x0\n"
+ "mov v18.d[1], v16.d[0]\n"
+ "str q18, [x23, #0x10]\n"
+ "dup v16.2d, v16.d[1]\n"
+ "mov v16.d[1], v17.d[0]\n"
+ "str q16, [x23, #0x20]\n"
+ "ushll v16.8h, v20.8b, #0x0\n"
+ "str q16, [x23, #0x30]\n"
+ "ushll2 v17.8h, v20.16b, #0x0\n"
+ "ushll v16.8h, v19.8b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x40]\n"
+ "dup v17.2d, v16.d[1]\n"
+ "ushll2 v16.8h, v19.16b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s16, [x24], #0x4\n"
+ "ushll v19.8h, v16.8b, #0x0\n"
+ "ldr s16, [x22], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ushll v18.8h, v16.8b, #0x0\n"
+ "ldr s16, [x21], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ushll v17.8h, v16.8b, #0x0\n"
+ "ldr s16, [x20], #0x4\n"
+ "str d19, [x23, #0x0]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str d18, [x23, #0x18]\n"
+ "str d17, [x23, #0x30]\n"
+ "str d16, [x23, #0x48]\n"
+ "add x23, x23, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b16, [x24], #0x1\n"
+ "ushll v19.8h, v16.8b, #0x0\n"
+ "ldr b16, [x22], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ushll v18.8h, v16.8b, #0x0\n"
+ "ldr b16, [x21], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ushll v17.8h, v16.8b, #0x0\n"
+ "ldr b16, [x20], #0x1\n"
+ "str h19, [x23, #0x0]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str h18, [x23, #0x18]\n"
+ "str h17, [x23, #0x30]\n"
+ "str h16, [x23, #0x48]\n"
+ "add x23, x23, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q17, [x24], #0x10\n"
+ "ushll v16.8h, v17.8b, #0x0\n"
+ "ldr d18, [x24], #0x8\n"
+ "sub x19, x19, #0x18\n"
+ "ushll2 v17.8h, v17.16b, #0x0\n"
+ "str q16, [x23, #0x0]\n"
+ "cmp x19, #0x18\n"
+ "dup v16.2d, v17.d[0]\n"
+ "str d16, [x23, #0x10]\n"
+ "dup v17.2d, v17.d[1]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "ushll v16.8h, v18.8b, #0x0\n"
+ "mov v17.d[1], v16.d[0]\n"
+ "str q17, [x23, #0x0]\n"
+ "dup v16.2d, v16.d[1]\n"
+ "str d16, [x23, #0x10]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr d17, [x24], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "cmp x19, #0xc\n"
+ "ld1 { v17.s }[2], [x24], #0x4\n"
+ "ushll v16.8h, v17.8b, #0x0\n"
+ "str q16, [x23, #0x0]\n"
+ "ushll2 v16.8h, v17.16b, #0x0\n"
+ "str d16, [x23, #0x10]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr s16, [x24], #0x4\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str d16, [x23, #0x0]\n"
+ "sub x19, x19, #0x4\n"
+ "add x23, x23, #0x8\n"
+ "cmp x19, #0x4\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b16, [x24], #0x1\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "str h16, [x23, #0x0]\n"
+ "sub x19, x19, #0x1\n"
+ "add x23, x23, #0x2\n"
+ "cmp x19, #0x1\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x18\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
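+
+// Transposed, unblocked specialisation widening uint8_t input to uint16_t.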
+template<>
+void Transform<12, 1, true, VLType::None>(
+ uint16_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_12_u8u16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
deleted file mode 100644
index f6233ef503..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 6x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<6, 1, true, 4, 4, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 12 x uint16_t specialisation
- TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 12x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 12 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR d1, [%[in0], #0x10]\n"
- "STR d1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x18\n"
- ASM_PREFETCH("[%[in0], #192]")
- : [in0] "+r" (in0),
- [out] "+r" (out)
- :
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "LDR d1, [%[in0], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x18\n"
- ASM_PREFETCH("[%[in0], #192]")
-
- "LDR x21, [%[in1]]\n"
- "LDR q2, [%[in1], #0x08]\n"
- "INS v1.d[1], x21\n"
- "ADD %x[in1], %x[in1], #0x18\n"
- "STP q0, q1, [%[out]]\n"
- "STR q2, [%x[out], #0x20]\n"
- ASM_PREFETCH("[%[in1], #192]")
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [out] "+r" (out)
- :
- : "x21", "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "LDR q0, [%x[in0]], #0x10\n"
- "STR q0, [%x[out]]\n"
- "LDR d1, [%x[in0]], #0x08\n"
- ASM_PREFETCH("[%[in0], #192]")
- "STR d1, [%x[out], #0x10]\n"
-
- "LDR q0, [%x[in1]], #0x10\n"
- "STR q0, [%x[out], #0x18]\n"
- "LDR d1, [%x[in1]], #0x08\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STR d1, [%x[out], #0x28]\n"
-
- "LDR q0, [%x[in2]], #0x10\n"
- "STR q0, [%x[out], #0x30]\n"
- "LDR d1, [%x[in2]], #0x08\n"
- ASM_PREFETCH("[%[in2], #192]")
- "STR d1, [%x[out], #0x40]\n"
-
- "LDR q0, [%x[in3]], #0x10\n"
- "STR q0, [%x[out], #0x48]\n"
- "LDR d1, [%x[in3]], #0x08\n"
- ASM_PREFETCH("[%[in3], #192]")
- "STR d1, [%x[out], #0x58]\n"
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [in2] "+r" (in2),
- [in3] "+r" (in3),
- [out] "+r" (out)
- :
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
deleted file mode 100644
index c0f3e17d31..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
-
-#include "transpose_interleave_common.hpp"
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR d5, [%[in1]], #8\n"
- "FCVTL v5.4s, v5.4h\n"
- "STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- "LDR d2, [%[in0]], #8\n"
- ASM_PREFETCH("[%[in0], #192]")
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- "LDR d5, [%[in1]], #8\n"
- "FCVTL v5.4s, v5.4h\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STP q4, q5, [%[out], #64]\n"
- "LDR q6, [%[in2]], #16\n"
- "FCVTL2 v7.4s, v6.8h\n"
- "FCVTL v6.4s, v6.4h\n"
- "STP q6, q7, [%[out], #96]\n"
- "LDR d8, [%[in2]], #8\n"
- "FCVTL v8.4s, v8.4h\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDR q9, [%[in3]], #16\n"
- "FCVTL2 v10.4s, v9.8h\n"
- "FCVTL v9.4s, v9.4h\n"
- "STP q8, q9, [%[out], #128]\n"
- "LDR d11, [%[in3]], #8\n"
- "FCVTL v11.4s, v11.4h\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
-
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 4, 2, VLType::None>::Transform(
- float* out, const __fp16* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__ && __ARM_FP16_ARGS
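
For orientation (illustrative only): each moveblock in the file deleted above widens a run of twelve __fp16 values to float while copying, via FCVTL/FCVTL2. A rough NEON-intrinsics rendering of moveblock_1x1, assuming <arm_neon.h> with AArch64 FP16 support:

    #include <arm_neon.h>

    // Load 8 + 4 halves, widen to 12 floats, store contiguously --
    // the same dataflow as the LDR/FCVTL/FCVTL2/STP sequence above.
    static inline void moveblock_1x1_ref(const __fp16 *&in0, float *out) {
        float16x8_t h8 = vld1q_f16(reinterpret_cast<const float16_t *>(in0));
        vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(h8)));  // FCVTL
        vst1q_f32(out + 4, vcvt_high_f32_f16(h8));           // FCVTL2
        float16x4_t h4 = vld1_f16(reinterpret_cast<const float16_t *>(in0 + 8));
        vst1q_f32(out + 8, vcvt_f32_f16(h4));                // FCVTL on the tail
        in0 += 12;
    }
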
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
new file mode 100644
index 0000000000..dd1bd508ef
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16.hpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_16(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 4 * height * sizeof(uint32_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q19, [x24], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x22], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [x23, #0x0]\n"
+ "str q18, [x23, #0x10]\n"
+ "str q17, [x23, #0x20]\n"
+ "str q16, [x23, #0x30]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x1\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x24], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s18, [x22], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "str s19, [x23, #0x0]\n"
+ "str s18, [x23, #0x10]\n"
+ "str s17, [x23, #0x20]\n"
+ "str s16, [x23, #0x30]\n"
+ "add x23, x23, #0x4\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x40\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x4\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Column loop
+ "ldr q16, [x24], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "cmp x19, #0x4\n"
+ "str q16, [x23, #0x0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Column loop skip
+ "cmp x19, #0x1\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: width 1 loop: loop
+ "ldr s16, [x24], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "cmp x19, #0x1\n"
+ "str s16, [x23, #0x0]\n"
+ "add x23, x23, #0x4\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x10\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
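
A scalar model of the layout a64_transpose_interleave_16 produces (names are ours; like the assembly, the sketch leaves unwritten tail lanes untouched): strip b of the output holds columns 4b..4b+3 of every row, with row r at word offset 4*r inside the strip, matching the kernel's out_stride of 4 * height words.

    #include <cstddef>
    #include <cstdint>

    // Reference layout model for the kernel above: pure 4-column transpose
    // strips, no inter-row interleaving.
    void transpose_interleave_16_ref(uint32_t *out, const uint32_t *in,
                                     size_t width, size_t height,
                                     size_t in_stride /* in words */) {
        const size_t out_stride = 4 * height;  // words per 4-column strip
        for (size_t b = 0; 4 * b < width; b++) {
            for (size_t r = 0; r < height; r++) {
                for (size_t i = 0; i < 4 && 4 * b + i < width; i++) {
                    out[b * out_stride + 4 * r + i] = in[r * in_stride + 4 * b + i];
                }
            }
        }
    }
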
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
new file mode 100644
index 0000000000..7e7fcf5b8b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x4.hpp
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_16_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x10\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x16, %x[in]\n"
+ "mov x15, %x[out]\n"
+ "add x14, x16, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x10\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q18, [x16], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q20, [x14], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q17, [x13], #0x10\n"
+ "zip1 v19.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x12], #0x10\n"
+ "zip2 v18.16b, v18.16b, v17.16b\n"
+ "ldr q3, [x11], #0x10\n"
+ "ldr q2, [x10], #0x10\n"
+ "zip1 v17.16b, v20.16b, v16.16b\n"
+ "ldr q1, [x9], #0x10\n"
+ "zip2 v16.16b, v20.16b, v16.16b\n"
+ "ldr q0, [x28], #0x10\n"
+ "zip1 v31.16b, v19.16b, v17.16b\n"
+ "ldr q30, [x27], #0x10\n"
+ "zip2 v20.16b, v19.16b, v17.16b\n"
+ "ldr q29, [x26], #0x10\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr q28, [x25], #0x10\n"
+ "zip2 v18.16b, v18.16b, v16.16b\n"
+ "ldr q27, [x24], #0x10\n"
+ "zip1 v17.16b, v3.16b, v1.16b\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v16.16b, v2.16b, v0.16b\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip1 v24.16b, v17.16b, v16.16b\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip2 v22.16b, v17.16b, v16.16b\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip2 v17.16b, v3.16b, v1.16b\n"
+ "str q31, [x15, #0x0]\n"
+ "zip2 v16.16b, v2.16b, v0.16b\n"
+ "str q20, [x15, #0x10]\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "str q19, [x15, #0x20]\n"
+ "zip2 v19.16b, v17.16b, v16.16b\n"
+ "str q18, [x15, #0x30]\n"
+ "zip1 v18.16b, v30.16b, v28.16b\n"
+ "str q24, [x15, #0x40]\n"
+ "zip1 v16.16b, v29.16b, v27.16b\n"
+ "str q22, [x15, #0x50]\n"
+ "zip1 v17.16b, v18.16b, v16.16b\n"
+ "str q20, [x15, #0x60]\n"
+ "zip2 v16.16b, v18.16b, v16.16b\n"
+ "str q19, [x15, #0x70]\n"
+ "zip2 v18.16b, v30.16b, v28.16b\n"
+ "str q17, [x15, #0x80]\n"
+ "zip2 v17.16b, v29.16b, v27.16b\n"
+ "str q16, [x15, #0x90]\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0xa0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0xb0]\n"
+ "zip1 v18.16b, v26.16b, v23.16b\n"
+ "zip1 v17.16b, v25.16b, v21.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0xc0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0xd0]\n"
+ "zip2 v18.16b, v26.16b, v23.16b\n"
+ "zip2 v17.16b, v25.16b, v21.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0xe0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0xf0]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 4 loop: loop
+ "ldr s18, [x16], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s17, [x14], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s16, [x13], #0x4\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x12], #0x4\n"
+ "ldr s18, [x11], #0x4\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "ldr s20, [x10], #0x4\n"
+ "ldr s17, [x9], #0x4\n"
+ "zip1 v23.16b, v19.16b, v16.16b\n"
+ "ldr s16, [x28], #0x4\n"
+ "zip1 v19.16b, v18.16b, v17.16b\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x26], #0x4\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ldr s17, [x25], #0x4\n"
+ "zip1 v21.16b, v19.16b, v16.16b\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "ldr s17, [x21], #0x4\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "str q23, [x15, #0x0]\n"
+ "str q21, [x15, #0x40]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q18, [x15, #0x80]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x15, #0xc0]\n"
+ "add x15, x15, #0x10\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr b18, [x16], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b17, [x14], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x13], #0x1\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr b16, [x12], #0x1\n"
+ "ldr b18, [x11], #0x1\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "ldr b20, [x10], #0x1\n"
+ "ldr b17, [x9], #0x1\n"
+ "zip1 v23.16b, v19.16b, v16.16b\n"
+ "ldr b16, [x28], #0x1\n"
+ "zip1 v19.16b, v18.16b, v17.16b\n"
+ "ldr b18, [x27], #0x1\n"
+ "ldr b22, [x26], #0x1\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ldr b17, [x25], #0x1\n"
+ "zip1 v21.16b, v19.16b, v16.16b\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b19, [x22], #0x1\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "ldr b17, [x21], #0x1\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "str s23, [x15, #0x0]\n"
+ "str s21, [x15, #0x40]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s18, [x15, #0x80]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x15, #0xc0]\n"
+ "add x15, x15, #0x4\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x100\n"
+ "cmp %x[height], #0x10\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+
+ "9:" // Tail row loop: Head
+ "mov x16, %x[in]\n"
+ "mov x15, %x[out]\n"
+ "add x14, x16, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add %x[in], x12, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x12, x12, %x[pad_row], GT\n"
+ "csel x13, x13, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
+ "ldr q18, [x16], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q21, [x14], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q17, [x13], #0x10\n"
+ "zip1 v20.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x12], #0x10\n"
+ "zip2 v19.16b, v18.16b, v17.16b\n"
+ "zip1 v18.16b, v21.16b, v16.16b\n"
+ "zip2 v17.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v18.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v20.16b, v18.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
+ "ldr s17, [x16], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s18, [x14], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s16, [x13], #0x4\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr s16, [x12], #0x4\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "add x15, x15, #0x10\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr b17, [x16], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b18, [x14], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x13], #0x1\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x12], #0x1\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x15, #0x0]\n"
+ "add x15, x15, #0x4\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x40\n"
+ "cmp %x[height], #0x1\n"
+ "bge 9b\n"
+ "16:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<16, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
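
The zip1/zip2 chains above implement a depth-4 byte interleave. The following scalar model (our names; zero padding mirroring the pad_row logic) describes the resulting layout: within each 16-column strip, every column of a row quad contributes four consecutive bytes, one per row.

    #include <cstddef>
    #include <cstdint>

    // Layout model for the kernel above: strip s holds, for each row quad
    // starting at row q and each in-range column c, the bytes
    // in[q+j][c], j = 0..3; rows past the real height read as zero.
    void transpose_interleave_16_1x4_ref(uint8_t *out, const uint8_t *in,
                                         size_t width, size_t height,
                                         size_t in_stride) {
        const size_t rheight = (height + 3) / 4 * 4;  // height rounded to a quad
        const size_t out_stride = 16 * rheight;       // bytes per 16-column strip
        for (size_t s = 0; 16 * s < width; s++) {
            for (size_t q = 0; q < rheight; q += 4) {
                uint8_t *o = out + s * out_stride + q * 16;
                for (size_t c = 16 * s; c < 16 * (s + 1) && c < width; c++) {
                    for (size_t j = 0; j < 4; j++) {
                        const size_t r = q + j;
                        *o++ = (r < height) ? in[r * in_stride + c] : 0;
                    }
                }
            }
        }
    }
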
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
new file mode 100644
index 0000000000..f52fbbae4d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_1x8.hpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_16_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 8) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "csel x20, x20, %x[pad_row], GT\n"
+ "csel x21, x21, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "ldr q19, [x26], #0x10\n"
+ "cmp x19, #0x20\n"
+ "ldr q4, [x25], #0x10\n"
+ "ldr q3, [x24], #0x10\n"
+ "ldr q16, [x23], #0x10\n"
+ "zip1 v2.16b, v17.16b, v16.16b\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v1.16b, v17.16b, v16.16b\n"
+ "ldr q0, [x26], #0x10\n"
+ "ldr q31, [x25], #0x10\n"
+ "ldr q30, [x24], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v29.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip2 v28.16b, v18.16b, v17.16b\n"
+ "ldr q27, [x21], #0x10\n"
+ "ldr q26, [x20], #0x10\n"
+ "zip1 v25.16b, v19.16b, v16.16b\n"
+ "ldr q24, [x22], #0x10\n"
+ "zip2 v21.16b, v19.16b, v16.16b\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip1 v20.16b, v4.16b, v27.16b\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v18.16b, v2.16b, v20.16b\n"
+ "zip1 v19.16b, v3.16b, v26.16b\n"
+ "zip1 v17.16b, v25.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v18.16b, v2.16b, v20.16b\n"
+ "zip2 v17.16b, v25.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v20.16b, v4.16b, v27.16b\n"
+ "zip1 v18.16b, v1.16b, v20.16b\n"
+ "zip2 v19.16b, v3.16b, v26.16b\n"
+ "zip1 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "zip2 v18.16b, v1.16b, v20.16b\n"
+ "zip2 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x60]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x70]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip1 v21.16b, v31.16b, v23.16b\n"
+ "zip1 v20.16b, v0.16b, v24.16b\n"
+ "zip1 v18.16b, v29.16b, v21.16b\n"
+ "zip1 v19.16b, v30.16b, v22.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v18.16b, v29.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v21.16b, v31.16b, v23.16b\n"
+ "zip1 v18.16b, v28.16b, v21.16b\n"
+ "zip2 v20.16b, v0.16b, v24.16b\n"
+ "zip2 v19.16b, v30.16b, v22.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "zip2 v18.16b, v28.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x60]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x70]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q19, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q18, [x26], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q28, [x25], #0x10\n"
+ "ldr q27, [x24], #0x10\n"
+ "ldr q17, [x23], #0x10\n"
+ "zip1 v26.16b, v19.16b, v17.16b\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip2 v25.16b, v19.16b, v17.16b\n"
+ "ldr q24, [x21], #0x10\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v22.16b, v18.16b, v16.16b\n"
+ "zip2 v21.16b, v18.16b, v16.16b\n"
+ "zip1 v20.16b, v28.16b, v24.16b\n"
+ "zip1 v18.16b, v26.16b, v20.16b\n"
+ "zip1 v19.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v22.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "zip2 v18.16b, v26.16b, v20.16b\n"
+ "zip2 v17.16b, v22.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v20.16b, v28.16b, v24.16b\n"
+ "zip1 v18.16b, v25.16b, v20.16b\n"
+ "zip2 v19.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x50]\n"
+ "zip2 v18.16b, v25.16b, v20.16b\n"
+ "zip2 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x60]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x70]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s17, [x28], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s21, [x26], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s20, [x24], #0x4\n"
+ "ldr s16, [x23], #0x4\n"
+ "zip1 v19.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x22], #0x4\n"
+ "ldr s16, [x21], #0x4\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v21.16b, v17.16b\n"
+ "zip1 v18.16b, v19.16b, v18.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b18, [x28], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b21, [x26], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b17, [x25], #0x1\n"
+ "ldr b20, [x24], #0x1\n"
+ "ldr b16, [x23], #0x1\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr b18, [x22], #0x1\n"
+ "ldr b16, [x21], #0x1\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v18.16b, v21.16b, v18.16b\n"
+ "zip1 v17.16b, v19.16b, v17.16b\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x80\n"
+ "cmp %x[height], #0x1\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 8, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<16, 8, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
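
One detail worth noting in the 1x8 kernel above: rather than branching per row, it redirects out-of-range row pointers to the zeroed pad row with a chain of conditional selects (the cmp/csel sequence at the head of the row loop). Roughly, in C (illustrative only):

    #include <cstddef>
    #include <cstdint>

    // C analogue of the cmp/csel chain: row pointers past the real height
    // all point at the zero-filled pad row, so the interleave loops that
    // follow stay branch-free.
    void select_rows_ref(const uint8_t *row[8], const uint8_t *in,
                         size_t in_stride, size_t height,
                         const uint8_t *pad_row) {
        for (size_t j = 0; j < 8; j++) {
            row[j] = (j < height) ? in + j * in_stride : pad_row;
        }
    }

The produced layout is the 1x4 scheme above at depth eight: 16-column strips in which each column contributes eight consecutive bytes, one per row of the octet.
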
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
new file mode 100644
index 0000000000..cfac12a84a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x2.hpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_16_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 2) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v1.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x28], #0x10\n"
+ "cmp x19, #0x10\n"
+ "zip2 v0.8h, v18.8h, v16.8h\n"
+ "ldr q16, [x26], #0x10\n"
+ "ldr q19, [x25], #0x10\n"
+ "zip1 v31.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x25], #0x10\n"
+ "zip2 v30.8h, v17.8h, v16.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "ldr q20, [x23], #0x10\n"
+ "zip1 v29.8h, v19.8h, v16.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "zip2 v28.8h, v19.8h, v16.8h\n"
+ "ldr q19, [x23], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v27.8h, v18.8h, v17.8h\n"
+ "ldr q26, [x21], #0x10\n"
+ "zip2 v25.8h, v18.8h, v17.8h\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v24.8h, v20.8h, v16.8h\n"
+ "ldr q17, [x20], #0x10\n"
+ "zip2 v23.8h, v20.8h, v16.8h\n"
+ "ldr q22, [x21], #0x10\n"
+ "zip1 v21.8h, v19.8h, v18.8h\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip2 v20.8h, v19.8h, v18.8h\n"
+ "zip1 v19.8h, v26.8h, v17.8h\n"
+ "str q1, [x27, #0x0]\n"
+ "zip2 v18.8h, v26.8h, v17.8h\n"
+ "str q0, [x27, #0x10]\n"
+ "str q31, [x27, #0x20]\n"
+ "zip1 v17.8h, v22.8h, v16.8h\n"
+ "str q30, [x27, #0x30]\n"
+ "zip2 v16.8h, v22.8h, v16.8h\n"
+ "str q29, [x27, #0x40]\n"
+ "str q28, [x27, #0x50]\n"
+ "str q27, [x27, #0x60]\n"
+ "str q25, [x27, #0x70]\n"
+ "str q24, [x27, #0x80]\n"
+ "str q23, [x27, #0x90]\n"
+ "str q21, [x27, #0xa0]\n"
+ "str q20, [x27, #0xb0]\n"
+ "str q19, [x27, #0xc0]\n"
+ "str q18, [x27, #0xd0]\n"
+ "str q17, [x27, #0xe0]\n"
+ "str q16, [x27, #0xf0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 4 loop: loop
+ "ldr d17, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x25], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d16, [x24], #0x8\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x23], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr d17, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q20, [x27, #0x0]\n"
+ "str q19, [x27, #0x40]\n"
+ "str q18, [x27, #0x80]\n"
+ "str q16, [x27, #0xc0]\n"
+ "add x27, x27, #0x10\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr h17, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x25], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h16, [x24], #0x2\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x23], #0x2\n"
+ "ldr h16, [x22], #0x2\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s20, [x27, #0x0]\n"
+ "str s19, [x27, #0x40]\n"
+ "str s18, [x27, #0x80]\n"
+ "str s16, [x27, #0xc0]\n"
+ "add x27, x27, #0x4\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x100\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+
+ "9:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
+ "ldr q18, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q16, [x26], #0x10\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "ldr q19, [x28], #0x10\n"
+ "cmp x19, #0x10\n"
+ "zip2 v18.8h, v18.8h, v16.8h\n"
+ "ldr q16, [x26], #0x10\n"
+ "str q17, [x27, #0x0]\n"
+ "zip1 v17.8h, v19.8h, v16.8h\n"
+ "str q18, [x27, #0x10]\n"
+ "zip2 v16.8h, v19.8h, v16.8h\n"
+ "str q17, [x27, #0x20]\n"
+ "str q16, [x27, #0x30]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
+ "ldr d17, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d16, [x26], #0x8\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "add x27, x27, #0x10\n"
+ "cmp x19, #0x4\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h16, [x26], #0x2\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s16, [x27, #0x0]\n"
+ "add x27, x27, #0x4\n"
+ "cmp x19, #0x1\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x40\n"
+ "cmp %x[height], #0x1\n"
+ "bge 9b\n"
+ "16:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 2, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
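
The 2x2 variant applies the same idea at 16-bit granularity with row pairs: each ZIP1/ZIP2 pair turns two rows of eight halfwords into one column-ordered run. A tiny runnable demonstration of that ordering:

    #include <cstdint>
    #include <cstdio>

    // Two rows of 8 halfwords interleave into (r:c, r+1:c) pairs -- the
    // ordering each ZIP1/ZIP2 pair in the kernel above produces.
    int main() {
        uint16_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};          // row r
        uint16_t b[8] = {10, 11, 12, 13, 14, 15, 16, 17};  // row r + 1
        uint16_t out[16];
        for (int c = 0; c < 8; c++) {
            out[2 * c + 0] = a[c];
            out[2 * c + 1] = b[c];
        }
        for (int i = 0; i < 16; i++) printf("%u ", out[i]);
        printf("\n");  // prints: 0 10 1 11 2 12 3 13 4 14 5 15 6 16 7 17
        return 0;
    }

The bfloat16 Transform wrapper above simply reinterprets the 16-bit data as uint16_t, since the transform only moves bits.
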
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
new file mode 100644
index 0000000000..8c8dfd1d0d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4.hpp
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_16_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 4) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q24, [x28], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "ldr q4, [x26], #0x10\n"
+ "cmp x19, #0x20\n"
+ "ldr q26, [x25], #0x10\n"
+ "zip1 v2.8h, v24.8h, v26.8h\n"
+ "ldr q3, [x28], #0x10\n"
+ "zip2 v9.8h, v24.8h, v26.8h\n"
+ "ldr q0, [x26], #0x10\n"
+ "ldr q22, [x25], #0x10\n"
+ "zip1 v31.8h, v3.8h, v22.8h\n"
+ "ldr q23, [x28], #0x10\n"
+ "zip2 v25.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26], #0x10\n"
+ "ldr q5, [x25], #0x10\n"
+ "zip1 v17.8h, v23.8h, v5.8h\n"
+ "ldr q19, [x28], #0x10\n"
+ "zip2 v20.8h, v23.8h, v5.8h\n"
+ "ldr q1, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v15.8h, v19.8h, v16.8h\n"
+ "ldr q8, [x24], #0x10\n"
+ "zip2 v11.8h, v19.8h, v16.8h\n"
+ "ldr q26, [x23], #0x10\n"
+ "ldr q19, [x22], #0x10\n"
+ "zip1 v5.8h, v4.8h, v8.8h\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip2 v6.8h, v4.8h, v8.8h\n"
+ "ldr q7, [x23], #0x10\n"
+ "zip1 v27.8h, v2.8h, v5.8h\n"
+ "ldr q23, [x22], #0x10\n"
+ "zip2 v8.8h, v2.8h, v5.8h\n"
+ "ldr q24, [x21], #0x10\n"
+ "zip1 v12.8h, v9.8h, v6.8h\n"
+ "ldr q13, [x24], #0x10\n"
+ "zip2 v16.8h, v9.8h, v6.8h\n"
+ "ldr q9, [x23], #0x10\n"
+ "zip1 v29.8h, v0.8h, v18.8h\n"
+ "ldr q10, [x22], #0x10\n"
+ "zip1 v14.8h, v31.8h, v29.8h\n"
+ "ldr q4, [x21], #0x10\n"
+ "zip2 v21.8h, v31.8h, v29.8h\n"
+ "ldr q6, [x24], #0x10\n"
+ "zip2 v18.8h, v0.8h, v18.8h\n"
+ "ldr q3, [x23], #0x10\n"
+ "zip1 v0.8h, v25.8h, v18.8h\n"
+ "ldr q31, [x22], #0x10\n"
+ "zip2 v29.8h, v25.8h, v18.8h\n"
+ "ldr q5, [x21], #0x10\n"
+ "zip1 v28.8h, v26.8h, v24.8h\n"
+ "ldr q25, [x20], #0x10\n"
+ "zip2 v26.8h, v26.8h, v24.8h\n"
+ "ldr q30, [x21], #0x10\n"
+ "zip1 v24.8h, v7.8h, v4.8h\n"
+ "zip2 v4.8h, v7.8h, v4.8h\n"
+ "ldr q18, [x20], #0x10\n"
+ "zip1 v7.8h, v19.8h, v25.8h\n"
+ "ldr q2, [x20], #0x10\n"
+ "zip2 v25.8h, v19.8h, v25.8h\n"
+ "zip1 v19.8h, v28.8h, v7.8h\n"
+ "zip2 v7.8h, v28.8h, v7.8h\n"
+ "ldr q28, [x20], #0x10\n"
+ "str q27, [x27, #0x0]\n"
+ "zip1 v27.8h, v26.8h, v25.8h\n"
+ "zip2 v26.8h, v26.8h, v25.8h\n"
+ "str q8, [x27, #0x10]\n"
+ "zip1 v25.8h, v23.8h, v18.8h\n"
+ "str q12, [x27, #0x20]\n"
+ "zip1 v8.8h, v24.8h, v25.8h\n"
+ "str q16, [x27, #0x30]\n"
+ "zip2 v25.8h, v24.8h, v25.8h\n"
+ "str q14, [x27, #0x40]\n"
+ "zip2 v12.8h, v23.8h, v18.8h\n"
+ "str q21, [x27, #0x50]\n"
+ "zip1 v21.8h, v4.8h, v12.8h\n"
+ "str q0, [x27, #0x60]\n"
+ "zip2 v14.8h, v4.8h, v12.8h\n"
+ "str q29, [x27, #0x70]\n"
+ "zip1 v12.8h, v22.8h, v13.8h\n"
+ "str q19, [x27, #0x80]\n"
+ "zip1 v24.8h, v17.8h, v12.8h\n"
+ "str q7, [x27, #0x90]\n"
+ "zip2 v23.8h, v17.8h, v12.8h\n"
+ "str q27, [x27, #0xa0]\n"
+ "zip2 v16.8h, v22.8h, v13.8h\n"
+ "str q26, [x27, #0xb0]\n"
+ "zip1 v19.8h, v20.8h, v16.8h\n"
+ "str q8, [x27, #0xc0]\n"
+ "zip2 v18.8h, v20.8h, v16.8h\n"
+ "str q25, [x27, #0xd0]\n"
+ "zip1 v16.8h, v1.8h, v6.8h\n"
+ "str q21, [x27, #0xe0]\n"
+ "zip1 v21.8h, v15.8h, v16.8h\n"
+ "str q14, [x27, #0xf0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip2 v17.8h, v15.8h, v16.8h\n"
+ "str q24, [x27, #0x0]\n"
+ "zip2 v16.8h, v1.8h, v6.8h\n"
+ "str q23, [x27, #0x10]\n"
+ "zip1 v20.8h, v11.8h, v16.8h\n"
+ "str q19, [x27, #0x20]\n"
+ "zip2 v19.8h, v11.8h, v16.8h\n"
+ "str q18, [x27, #0x30]\n"
+ "zip1 v18.8h, v9.8h, v5.8h\n"
+ "str q21, [x27, #0x40]\n"
+ "zip1 v16.8h, v10.8h, v2.8h\n"
+ "str q17, [x27, #0x50]\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q20, [x27, #0x60]\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q19, [x27, #0x70]\n"
+ "zip2 v18.8h, v9.8h, v5.8h\n"
+ "str q17, [x27, #0x80]\n"
+ "zip2 v17.8h, v10.8h, v2.8h\n"
+ "str q16, [x27, #0x90]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xa0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xb0]\n"
+ "zip1 v18.8h, v3.8h, v30.8h\n"
+ "zip1 v17.8h, v31.8h, v28.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xc0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xd0]\n"
+ "zip2 v18.8h, v3.8h, v30.8h\n"
+ "zip2 v17.8h, v31.8h, v28.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xe0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xf0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v22.8h, v17.8h, v16.8h\n"
+ "ldr q21, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v0.8h, v18.8h, v17.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v31.8h, v18.8h, v17.8h\n"
+ "ldr q30, [x23], #0x10\n"
+ "ldr q29, [x22], #0x10\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "ldr q18, [x24], #0x10\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "ldr q28, [x23], #0x10\n"
+ "zip1 v27.8h, v19.8h, v17.8h\n"
+ "ldr q26, [x22], #0x10\n"
+ "zip2 v20.8h, v19.8h, v17.8h\n"
+ "ldr q25, [x21], #0x10\n"
+ "zip1 v19.8h, v22.8h, v16.8h\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip2 v23.8h, v22.8h, v16.8h\n"
+ "ldr q22, [x21], #0x10\n"
+ "zip1 v17.8h, v21.8h, v18.8h\n"
+ "zip2 v18.8h, v21.8h, v18.8h\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v16.8h, v0.8h, v17.8h\n"
+ "str q27, [x27, #0x0]\n"
+ "zip2 v17.8h, v0.8h, v17.8h\n"
+ "str q20, [x27, #0x10]\n"
+ "zip1 v20.8h, v31.8h, v18.8h\n"
+ "str q19, [x27, #0x20]\n"
+ "zip2 v19.8h, v31.8h, v18.8h\n"
+ "str q23, [x27, #0x30]\n"
+ "zip1 v18.8h, v30.8h, v25.8h\n"
+ "str q16, [x27, #0x40]\n"
+ "zip1 v16.8h, v29.8h, v24.8h\n"
+ "str q17, [x27, #0x50]\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q20, [x27, #0x60]\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q19, [x27, #0x70]\n"
+ "zip2 v18.8h, v30.8h, v25.8h\n"
+ "str q17, [x27, #0x80]\n"
+ "zip2 v17.8h, v29.8h, v24.8h\n"
+ "str q16, [x27, #0x90]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xa0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xb0]\n"
+ "zip1 v18.8h, v28.8h, v22.8h\n"
+ "zip1 v17.8h, v26.8h, v21.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xc0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xd0]\n"
+ "zip2 v18.8h, v28.8h, v22.8h\n"
+ "zip2 v17.8h, v26.8h, v21.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xe0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0xf0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d17, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d18, [x26], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d16, [x25], #0x8\n"
+ "zip1 v17.8h, v17.8h, v16.8h\n"
+ "ldr d16, [x24], #0x8\n"
+ "ldr d21, [x23], #0x8\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "ldr d20, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "zip1 v18.8h, v17.8h, v16.8h\n"
+ "zip2 v17.8h, v17.8h, v16.8h\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q18, [x27, #0x0]\n"
+ "zip1 v18.8h, v21.8h, v19.8h\n"
+ "str q17, [x27, #0x10]\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x80]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x90]\n"
+ "add x27, x27, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h18, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h17, [x26], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h16, [x25], #0x2\n"
+ "zip1 v18.8h, v18.8h, v16.8h\n"
+ "ldr h16, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "ldr h19, [x22], #0x2\n"
+ "ldr h17, [x21], #0x2\n"
+ "zip1 v18.8h, v18.8h, v16.8h\n"
+ "ldr h16, [x20], #0x2\n"
+ "zip1 v17.8h, v20.8h, v17.8h\n"
+ "str d18, [x27, #0x0]\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x27, #0x80]\n"
+ "add x27, x27, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x100\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "ldr q19, [x26], #0x10\n"
+ "cmp x19, #0x20\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v20.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v0.8h, v17.8h, v16.8h\n"
+ "ldr q31, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v30.8h, v18.8h, v16.8h\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v29.8h, v18.8h, v16.8h\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v27.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v26.8h, v17.8h, v16.8h\n"
+ "ldr q25, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v24.8h, v18.8h, v17.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v23.8h, v18.8h, v17.8h\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip1 v17.8h, v19.8h, v16.8h\n"
+ "zip2 v19.8h, v19.8h, v16.8h\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
+ "zip2 v17.8h, v20.8h, v17.8h\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip1 v18.8h, v0.8h, v19.8h\n"
+ "zip2 v19.8h, v0.8h, v19.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "zip1 v16.8h, v31.8h, v22.8h\n"
+ "str q17, [x27, #0x10]\n"
+ "zip1 v17.8h, v30.8h, v16.8h\n"
+ "str q18, [x27, #0x20]\n"
+ "zip2 v18.8h, v30.8h, v16.8h\n"
+ "str q19, [x27, #0x30]\n"
+ "zip2 v16.8h, v31.8h, v22.8h\n"
+ "str q17, [x27, #0x40]\n"
+ "zip1 v17.8h, v29.8h, v16.8h\n"
+ "str q18, [x27, #0x50]\n"
+ "zip2 v16.8h, v29.8h, v16.8h\n"
+ "str q17, [x27, #0x60]\n"
+ "zip1 v17.8h, v28.8h, v21.8h\n"
+ "str q16, [x27, #0x70]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "zip1 v16.8h, v27.8h, v17.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.8h, v27.8h, v17.8h\n"
+ "zip2 v17.8h, v28.8h, v21.8h\n"
+ "str q16, [x27, #0x10]\n"
+ "zip1 v16.8h, v26.8h, v17.8h\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.8h, v26.8h, v17.8h\n"
+ "str q16, [x27, #0x30]\n"
+ "zip1 v17.8h, v25.8h, v20.8h\n"
+ "zip1 v16.8h, v24.8h, v17.8h\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.8h, v24.8h, v17.8h\n"
+ "str q16, [x27, #0x50]\n"
+ "zip2 v17.8h, v25.8h, v20.8h\n"
+ "zip1 v16.8h, v23.8h, v17.8h\n"
+ "str q16, [x27, #0x60]\n"
+ "zip2 v16.8h, v23.8h, v17.8h\n"
+ "str q16, [x27, #0x70]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q25, [x26], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v24.8h, v17.8h, v16.8h\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v23.8h, v17.8h, v16.8h\n"
+ "ldr q22, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v21.8h, v18.8h, v17.8h\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v20.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x24], #0x10\n"
+ "zip1 v18.8h, v25.8h, v16.8h\n"
+ "zip2 v17.8h, v25.8h, v16.8h\n"
+ "zip1 v16.8h, v24.8h, v18.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.8h, v24.8h, v18.8h\n"
+ "str q16, [x27, #0x10]\n"
+ "zip1 v16.8h, v23.8h, v17.8h\n"
+ "str q16, [x27, #0x20]\n"
+ "zip2 v16.8h, v23.8h, v17.8h\n"
+ "str q16, [x27, #0x30]\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v21.8h, v17.8h\n"
+ "str q16, [x27, #0x40]\n"
+ "zip2 v16.8h, v21.8h, v17.8h\n"
+ "str q16, [x27, #0x50]\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
+ "str q16, [x27, #0x60]\n"
+ "zip2 v16.8h, v20.8h, v17.8h\n"
+ "str q16, [x27, #0x70]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d18, [x28], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d17, [x26], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d16, [x25], #0x8\n"
+ "zip1 v18.8h, v18.8h, v16.8h\n"
+ "ldr d16, [x24], #0x8\n"
+ "zip1 v17.8h, v17.8h, v16.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x28], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h18, [x26], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h16, [x25], #0x2\n"
+ "zip1 v17.8h, v17.8h, v16.8h\n"
+ "ldr h16, [x24], #0x2\n"
+ "zip1 v16.8h, v18.8h, v16.8h\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x80\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 4, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_2x4(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
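
Same family once more, now depth-4 on 16-bit elements: the two zip stages gather, per column of a row quad, four consecutive halfwords. A minimal model of one 16-column output block (names ours; zero padding as in the pad_row path):

    #include <cstddef>
    #include <cstdint>

    // One 16-column block of the 2x4 layout: for the row quad starting at
    // row 4*quad and each in-range column c, emit in[4*quad + j][c],
    // j = 0..3, with rows past `height` padded to zero.
    void block_16_2x4_ref(uint16_t *o, const uint16_t *in, size_t col0,
                          size_t quad, size_t width, size_t height,
                          size_t in_stride) {
        for (size_t c = col0; c < col0 + 16 && c < width; c++) {
            for (size_t j = 0; j < 4; j++) {
                const size_t r = 4 * quad + j;
                *o++ = (r < height) ? in[r * in_stride + c] : 0;
            }
        }
    }

The fp32bf16 variant that follows fuses this interleave with a float-to-bfloat16 downconvert, using BFCVTN/BFCVTN2 emitted as raw .inst words.
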
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..2ecf03c4c1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_16_2x4_fp32bf16.hpp
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_16_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 16 * roundup<size_t>(height, 4) * sizeof(bfloat16);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q19, [x26], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v24.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v23.4s, v17.4s, v16.4s\n"
+ "ldr q22, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v21.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v14.4s, v18.4s, v16.4s\n"
+ "ldr q13, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v12.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v11.4s, v17.4s, v16.4s\n"
+ "ldr q10, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v9.4s, v18.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v8.4s, v18.4s, v17.4s\n"
+ "ldr q7, [x23], #0x10\n"
+ "ldr q6, [x22], #0x10\n"
+ "zip1 v17.4s, v19.4s, v16.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip2 v19.4s, v19.4s, v16.4s\n"
+ "ldr q5, [x23], #0x10\n"
+ "zip1 v16.4s, v24.4s, v17.4s\n"
+ "ldr q4, [x22], #0x10\n"
+ ".inst 0x0ea16a03 // bfcvtn v3.4h, v16.4s\n"
+ "ldr q2, [x21], #0x10\n"
+ "zip2 v17.4s, v24.4s, v17.4s\n"
+ "ldr q1, [x24], #0x10\n"
+ "zip1 v16.4s, v23.4s, v19.4s\n"
+ "ldr q0, [x23], #0x10\n"
+ ".inst 0x4ea16a23 // bfcvtn2 v3.8h, v17.4s\n"
+ "ldr q31, [x22], #0x10\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr q30, [x21], #0x10\n"
+ "zip2 v16.4s, v23.4s, v19.4s\n"
+ "ldr q29, [x24], #0x10\n"
+ "zip1 v17.4s, v22.4s, v20.4s\n"
+ "ldr q28, [x23], #0x10\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "ldr q27, [x22], #0x10\n"
+ "zip1 v16.4s, v21.4s, v17.4s\n"
+ "ldr q26, [x21], #0x10\n"
+ ".inst 0x0ea16a19 // bfcvtn v25.4h, v16.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip2 v16.4s, v21.4s, v17.4s\n"
+ "ldr q23, [x21], #0x10\n"
+ ".inst 0x4ea16a19 // bfcvtn2 v25.8h, v16.4s\n"
+ "zip2 v17.4s, v22.4s, v20.4s\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v16.4s, v14.4s, v17.4s\n"
+ "ldr q21, [x20], #0x10\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v16.4s, v14.4s, v17.4s\n"
+ "ldr q20, [x20], #0x10\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "zip1 v17.4s, v13.4s, v1.4s\n"
+ "str q3, [x27, #0x0]\n"
+ "zip1 v16.4s, v12.4s, v17.4s\n"
+ "str q18, [x27, #0x10]\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "str q25, [x27, #0x20]\n"
+ "zip2 v16.4s, v12.4s, v17.4s\n"
+ "str q19, [x27, #0x30]\n"
+ "zip2 v17.4s, v13.4s, v1.4s\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "str q18, [x27, #0x40]\n"
+ "zip1 v16.4s, v11.4s, v17.4s\n"
+ "zip2 v19.4s, v11.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip1 v17.4s, v10.4s, v29.4s\n"
+ "zip1 v16.4s, v9.4s, v17.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "str q18, [x27, #0x50]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v9.4s, v17.4s\n"
+ "zip2 v17.4s, v10.4s, v29.4s\n"
+ "zip1 v16.4s, v8.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x60]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v16.4s, v8.4s, v17.4s\n"
+ "zip1 v18.4s, v7.4s, v2.4s\n"
+ "zip1 v17.4s, v6.4s, v24.4s\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "str q19, [x27, #0x70]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v7.4s, v2.4s\n"
+ "zip2 v17.4s, v6.4s, v24.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x80]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v5.4s, v30.4s\n"
+ "zip1 v17.4s, v4.4s, v22.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x90]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v5.4s, v30.4s\n"
+ "zip2 v17.4s, v4.4s, v22.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xa0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v0.4s, v26.4s\n"
+ "zip1 v17.4s, v31.4s, v21.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xb0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v0.4s, v26.4s\n"
+ "zip2 v17.4s, v31.4s, v21.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xc0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v28.4s, v23.4s\n"
+ "zip1 v17.4s, v27.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xd0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v28.4s, v23.4s\n"
+ "zip2 v17.4s, v27.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xe0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q16, [x27, #0xf0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 4 loop: loop
+ "ldr q20, [x28], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x26], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v19.4s, v20.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v25.4s, v20.4s, v17.4s\n"
+ "ldr q24, [x23], #0x10\n"
+ "ldr q23, [x22], #0x10\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "ldr q22, [x21], #0x10\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q20, [x20], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v25.4s, v21.4s\n"
+ ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n"
+ "str q18, [x27, #0x0]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v21.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v20.4s\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "str q19, [x27, #0x10]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x80]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q16, [x27, #0x90]\n"
+ "add x27, x27, #0x20\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr s18, [x28], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s17, [x26], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v18.4s, v18.4s, v16.4s\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "ldr s18, [x20], #0x4\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "str d16, [x27, #0x0]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x27, #0x80]\n"
+ "add x27, x27, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x100\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+
+ "9:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v1.4s, v17.4s, v16.4s\n"
+ "ldr q0, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v31.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v30.4s, v18.4s, v16.4s\n"
+ "ldr q29, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v28.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v27.4s, v17.4s, v16.4s\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v25.4s, v18.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v24.4s, v18.4s, v17.4s\n"
+ "ldr q23, [x24], #0x10\n"
+ "zip1 v17.4s, v20.4s, v16.4s\n"
+ "zip2 v22.4s, v20.4s, v16.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ "zip2 v19.4s, v19.4s, v17.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip1 v16.4s, v1.4s, v22.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v16.4s, v1.4s, v22.4s\n"
+ ".inst 0x4ea16a71 // bfcvtn2 v17.8h, v19.4s\n"
+ "str q17, [x27, #0x0]\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "zip1 v17.4s, v0.4s, v23.4s\n"
+ "str q18, [x27, #0x10]\n"
+ "zip1 v16.4s, v31.4s, v17.4s\n"
+ "zip2 v19.4s, v31.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v0.4s, v23.4s\n"
+ "zip1 v16.4s, v30.4s, v17.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "str q18, [x27, #0x20]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v30.4s, v17.4s\n"
+ "zip1 v17.4s, v29.4s, v21.4s\n"
+ "zip1 v16.4s, v28.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x30]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v28.4s, v17.4s\n"
+ "zip2 v17.4s, v29.4s, v21.4s\n"
+ "zip1 v16.4s, v27.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x40]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v27.4s, v17.4s\n"
+ "zip1 v17.4s, v26.4s, v20.4s\n"
+ "zip1 v16.4s, v25.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x50]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v17.4s, v25.4s, v17.4s\n"
+ "zip2 v18.4s, v26.4s, v20.4s\n"
+ "zip1 v16.4s, v24.4s, v18.4s\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ "str q19, [x27, #0x60]\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v24.4s, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q17, [x27, #0x70]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: width 4 loop: loop
+ "ldr q19, [x28], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x26], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v21.4s, v19.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v20.4s, v19.4s, v17.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip2 v19.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v21.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v21.4s, v17.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n"
+ "str q18, [x27, #0x0]\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q17, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr s17, [x28], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s18, [x26], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v17.4s, v17.4s, v16.4s\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x80\n"
+ "cmp %x[height], #0x1\n"
+ "bge 9b\n"
+ "16:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
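+// Expose the kernel above through the generic Transform<> interface: a
+// 16-wide, 4-high interleave that converts FP32 input to BF16 on the fly,
+// with the x range as the kernel's width and the k range as its height.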
+template<>
+void Transform<16, 4, true, VLType::None>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_16_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
new file mode 100644
index 0000000000..9f3ab95108
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24.hpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
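+// Transpose blocks of 16-bit elements, four input rows at a time, into
+// output tiles of 12 elements per row (hence the 12 * height out_stride).
+// This is pure data movement with no type conversion, which is why the
+// float and int16/uint16 Transform<> specialisations below can all share
+// this one kernel.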
+void a64_transpose_interleave_24(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q29, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q18, [x22], #0x10\n"
+ "dup v28.2d, v18.d[1]\n"
+ "ldr q16, [x24], #0x10\n"
+ "cmp x19, #0x18\n"
+ "dup v27.2d, v16.d[0]\n"
+ "ldr q17, [x24], #0x10\n"
+ "dup v26.2d, v16.d[1]\n"
+ "ldr q16, [x22], #0x10\n"
+ "mov v27.d[1], v18.d[0]\n"
+ "ldr q25, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "mov v26.d[1], v17.d[0]\n"
+ "ldr q23, [x22], #0x10\n"
+ "mov v28.d[1], v16.d[0]\n"
+ "dup v22.2d, v17.d[1]\n"
+ "ldr q17, [x21], #0x10\n"
+ "dup v21.2d, v24.d[1]\n"
+ "ldr q20, [x20], #0x10\n"
+ "mov v22.d[1], v16.d[1]\n"
+ "ldr q16, [x21], #0x10\n"
+ "dup v19.2d, v17.d[0]\n"
+ "dup v18.2d, v17.d[1]\n"
+ "ldr q17, [x20], #0x10\n"
+ "mov v19.d[1], v24.d[0]\n"
+ "str q29, [x23, #0x0]\n"
+ "mov v21.d[1], v20.d[0]\n"
+ "str q27, [x23, #0x10]\n"
+ "str q28, [x23, #0x20]\n"
+ "mov v18.d[1], v16.d[0]\n"
+ "dup v16.2d, v16.d[1]\n"
+ "str q25, [x23, #0x30]\n"
+ "mov v16.d[1], v20.d[1]\n"
+ "str q19, [x23, #0x40]\n"
+ "str q21, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "str q26, [x23, #0x0]\n"
+ "str q22, [x23, #0x10]\n"
+ "str q23, [x23, #0x20]\n"
+ "str q18, [x23, #0x30]\n"
+ "str q16, [x23, #0x40]\n"
+ "str q17, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q22, [x24], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "ldr q16, [x22], #0x10\n"
+ "dup v21.2d, v16.d[1]\n"
+ "ldr d20, [x24], #0x8\n"
+ "cmp x19, #0xc\n"
+ "mov v20.d[1], v16.d[0]\n"
+ "ldr d16, [x22], #0x8\n"
+ "ldr q19, [x21], #0x10\n"
+ "mov v21.d[1], v16.d[0]\n"
+ "ldr d18, [x21], #0x8\n"
+ "ldr q16, [x20], #0x10\n"
+ "mov v18.d[1], v16.d[0]\n"
+ "ldr d17, [x20], #0x8\n"
+ "dup v16.2d, v16.d[1]\n"
+ "str q22, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "mov v16.d[1], v17.d[0]\n"
+ "str q21, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q16, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x24], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d18, [x22], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d17, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x23, #0x0]\n"
+ "str d18, [x23, #0x18]\n"
+ "str d17, [x23, #0x30]\n"
+ "str d16, [x23, #0x48]\n"
+ "add x23, x23, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x24], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h18, [x22], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h17, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x23, #0x0]\n"
+ "str h18, [x23, #0x18]\n"
+ "str h17, [x23, #0x30]\n"
+ "str h16, [x23, #0x48]\n"
+ "add x23, x23, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q19, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "cmp x19, #0x18\n"
+ "ldr q16, [x24], #0x10\n"
+ "dup v18.2d, v16.d[0]\n"
+ "ldr q17, [x24], #0x10\n"
+ "dup v16.2d, v16.d[1]\n"
+ "str q19, [x23, #0x0]\n"
+ "str d18, [x23, #0x10]\n"
+ "mov v16.d[1], v17.d[0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "str q16, [x23, #0x0]\n"
+ "dup v16.2d, v17.d[1]\n"
+ "str d16, [x23, #0x10]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x24], #0x10\n"
+ "sub x19, x19, #0xc\n"
+ "cmp x19, #0xc\n"
+ "ldr d16, [x24], #0x8\n"
+ "str q17, [x23, #0x0]\n"
+ "str d16, [x23, #0x10]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "cmp x19, #0x4\n"
+ "str d16, [x23, #0x0]\n"
+ "add x23, x23, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x24], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "cmp x19, #0x1\n"
+ "str h16, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x18\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
+
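+// For FP32 data each element travels as two 16-bit halves, so a 6-float-wide
+// transform maps onto the same 12-element uint16_t kernel as the 16-bit
+// specialisations below; note the width is scaled by sizeof(float) / 2.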
+template<>
+void Transform<6, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 1, true, VLType::None>(
+ int16_t *out, const int16_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int16_t) / 2,
+ stride * sizeof(int16_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<12, 1, true, VLType::None>(
+ uint16_t *out, const uint16_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint16_t) / 2,
+ stride * sizeof(uint16_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..101be7e843
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_2x4_fp32bf16.hpp
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
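+// Transpose-interleave FP32 into BF16 with a 2x4 block layout and 24-wide
+// output tiles: rows are zipped in groups of four, then narrowed with
+// bfcvtn/bfcvtn2. The conversions are written as raw .inst encodings,
+// presumably so assemblers without BF16 mnemonics can still build the file.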
+void a64_transpose_interleave_24_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
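+ // Rows are consumed four at a time; when height is not a multiple of 4 the
+ // csel instructions in the tail loop substitute this zeroed row for the
+ // missing inputs.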
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
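+ // Each output tile holds 24 BF16 values per row of the 4-row block, so the
+ // height is rounded up to the block size when computing the stride.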
+ size_t out_stride = 24 * roundup<size_t>(height, 4) * sizeof(bfloat16);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x8\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q3, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q27, [x26], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q26, [x25], #0x10\n"
+ "zip1 v28.4s, v3.4s, v26.4s\n"
+ "ldr q1, [x28], #0x10\n"
+ "zip2 v12.4s, v3.4s, v26.4s\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v11.4s, v1.4s, v17.4s\n"
+ "ldr q31, [x28], #0x10\n"
+ "zip2 v23.4s, v1.4s, v17.4s\n"
+ "ldr q3, [x26], #0x10\n"
+ "ldr q1, [x25], #0x10\n"
+ "zip1 v10.4s, v31.4s, v1.4s\n"
+ "ldr q14, [x28], #0x10\n"
+ "zip2 v17.4s, v31.4s, v1.4s\n"
+ "ldr q6, [x26], #0x10\n"
+ "ldr q1, [x25], #0x10\n"
+ "zip1 v5.4s, v14.4s, v1.4s\n"
+ "ldr q0, [x28], #0x10\n"
+ "zip2 v8.4s, v14.4s, v1.4s\n"
+ "ldr q2, [x26], #0x10\n"
+ "ldr q30, [x25], #0x10\n"
+ "zip1 v15.4s, v0.4s, v30.4s\n"
+ "ldr q14, [x28], #0x10\n"
+ "zip2 v1.4s, v0.4s, v30.4s\n"
+ "ldr q0, [x26], #0x10\n"
+ "ldr q29, [x25], #0x10\n"
+ "zip1 v19.4s, v14.4s, v29.4s\n"
+ "ldr q25, [x24], #0x10\n"
+ "zip2 v30.4s, v14.4s, v29.4s\n"
+ "ldr q7, [x23], #0x10\n"
+ "ldr q31, [x22], #0x10\n"
+ "zip1 v20.4s, v27.4s, v25.4s\n"
+ "ldr q24, [x24], #0x10\n"
+ "zip2 v4.4s, v27.4s, v25.4s\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip1 v14.4s, v28.4s, v20.4s\n"
+ "ldr q13, [x22], #0x10\n"
+ ".inst 0x0ea169d0 // bfcvtn v16.4h, v14.4s\n"
+ "ldr q29, [x21], #0x10\n"
+ "zip2 v21.4s, v28.4s, v20.4s\n"
+ "ldr q27, [x24], #0x10\n"
+ "zip1 v9.4s, v12.4s, v4.4s\n"
+ "ldr q14, [x23], #0x10\n"
+ ".inst 0x4ea16ab0 // bfcvtn2 v16.8h, v21.4s\n"
+ "ldr q21, [x22], #0x10\n"
+ ".inst 0x0ea16929 // bfcvtn v9.4h, v9.4s\n"
+ "ldr q18, [x21], #0x10\n"
+ "zip2 v25.4s, v12.4s, v4.4s\n"
+ "ldr q4, [x24], #0x10\n"
+ "zip1 v28.4s, v26.4s, v24.4s\n"
+ "ldr q20, [x23], #0x10\n"
+ ".inst 0x4ea16b29 // bfcvtn2 v9.8h, v25.4s\n"
+ "ldr q12, [x22], #0x10\n"
+ "zip1 v25.4s, v11.4s, v28.4s\n"
+ ".inst 0x0ea16b39 // bfcvtn v25.4h, v25.4s\n"
+ "zip2 v11.4s, v11.4s, v28.4s\n"
+ "ldr q28, [x24], #0x10\n"
+ "zip2 v26.4s, v26.4s, v24.4s\n"
+ "ldr q24, [x23], #0x10\n"
+ ".inst 0x4ea16979 // bfcvtn2 v25.8h, v11.4s\n"
+ "zip1 v11.4s, v23.4s, v26.4s\n"
+ ".inst 0x0ea1696b // bfcvtn v11.4h, v11.4s\n"
+ "zip2 v23.4s, v23.4s, v26.4s\n"
+ "ldr q26, [x24], #0x10\n"
+ ".inst 0x4ea16aeb // bfcvtn2 v11.8h, v23.4s\n"
+ "zip1 v23.4s, v3.4s, v27.4s\n"
+ "zip2 v27.4s, v3.4s, v27.4s\n"
+ "zip1 v3.4s, v10.4s, v23.4s\n"
+ ".inst 0x0ea16863 // bfcvtn v3.4h, v3.4s\n"
+ "zip2 v10.4s, v10.4s, v23.4s\n"
+ "ldr q23, [x23], #0x10\n"
+ ".inst 0x4ea16943 // bfcvtn2 v3.8h, v10.4s\n"
+ "zip1 v10.4s, v17.4s, v27.4s\n"
+ ".inst 0x0ea1694a // bfcvtn v10.4h, v10.4s\n"
+ "zip2 v27.4s, v17.4s, v27.4s\n"
+ "ldr q17, [x22], #0x10\n"
+ ".inst 0x4ea16b6a // bfcvtn2 v10.8h, v27.4s\n"
+ "zip1 v27.4s, v6.4s, v4.4s\n"
+ "zip2 v6.4s, v6.4s, v4.4s\n"
+ "zip1 v4.4s, v5.4s, v27.4s\n"
+ ".inst 0x0ea16884 // bfcvtn v4.4h, v4.4s\n"
+ "zip2 v27.4s, v5.4s, v27.4s\n"
+ "ldr q5, [x22], #0x10\n"
+ ".inst 0x4ea16b64 // bfcvtn2 v4.8h, v27.4s\n"
+ "zip1 v27.4s, v8.4s, v6.4s\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ "zip2 v6.4s, v8.4s, v6.4s\n"
+ "ldr q8, [x21], #0x10\n"
+ ".inst 0x4ea168db // bfcvtn2 v27.8h, v6.4s\n"
+ "zip1 v6.4s, v2.4s, v28.4s\n"
+ "zip2 v2.4s, v2.4s, v28.4s\n"
+ "zip1 v28.4s, v15.4s, v6.4s\n"
+ ".inst 0x0ea16b9c // bfcvtn v28.4h, v28.4s\n"
+ "zip2 v6.4s, v15.4s, v6.4s\n"
+ "ldr q15, [x21], #0x10\n"
+ ".inst 0x4ea168dc // bfcvtn2 v28.8h, v6.4s\n"
+ "zip1 v6.4s, v1.4s, v2.4s\n"
+ ".inst 0x0ea168c6 // bfcvtn v6.4h, v6.4s\n"
+ "zip2 v2.4s, v1.4s, v2.4s\n"
+ "ldr q1, [x21], #0x10\n"
+ ".inst 0x4ea16846 // bfcvtn2 v6.8h, v2.4s\n"
+ "zip1 v2.4s, v0.4s, v26.4s\n"
+ "zip2 v26.4s, v0.4s, v26.4s\n"
+ "zip1 v0.4s, v19.4s, v2.4s\n"
+ ".inst 0x0ea16800 // bfcvtn v0.4h, v0.4s\n"
+ "zip2 v19.4s, v19.4s, v2.4s\n"
+ "ldr q2, [x21], #0x10\n"
+ ".inst 0x4ea16a60 // bfcvtn2 v0.8h, v19.4s\n"
+ "zip1 v19.4s, v30.4s, v26.4s\n"
+ ".inst 0x0ea16a73 // bfcvtn v19.4h, v19.4s\n"
+ "zip2 v30.4s, v30.4s, v26.4s\n"
+ "ldr q26, [x20], #0x10\n"
+ ".inst 0x4ea16bd3 // bfcvtn2 v19.8h, v30.4s\n"
+ "zip1 v30.4s, v7.4s, v29.4s\n"
+ "zip2 v29.4s, v7.4s, v29.4s\n"
+ "zip1 v7.4s, v22.4s, v18.4s\n"
+ "zip2 v18.4s, v22.4s, v18.4s\n"
+ "zip1 v22.4s, v31.4s, v26.4s\n"
+ "zip2 v26.4s, v31.4s, v26.4s\n"
+ "zip1 v31.4s, v30.4s, v22.4s\n"
+ ".inst 0x0ea16bff // bfcvtn v31.4h, v31.4s\n"
+ "zip2 v30.4s, v30.4s, v22.4s\n"
+ "ldr q22, [x20], #0x10\n"
+ ".inst 0x4ea16bdf // bfcvtn2 v31.8h, v30.4s\n"
+ "zip1 v30.4s, v29.4s, v26.4s\n"
+ ".inst 0x0ea16bde // bfcvtn v30.4h, v30.4s\n"
+ "zip2 v26.4s, v29.4s, v26.4s\n"
+ "ldr q29, [x20], #0x10\n"
+ ".inst 0x4ea16b5e // bfcvtn2 v30.8h, v26.4s\n"
+ "zip1 v26.4s, v13.4s, v22.4s\n"
+ "zip2 v13.4s, v13.4s, v22.4s\n"
+ "zip1 v22.4s, v7.4s, v26.4s\n"
+ ".inst 0x0ea16ad6 // bfcvtn v22.4h, v22.4s\n"
+ "zip2 v7.4s, v7.4s, v26.4s\n"
+ "ldr q26, [x20], #0x10\n"
+ ".inst 0x4ea168f6 // bfcvtn2 v22.8h, v7.4s\n"
+ "zip1 v7.4s, v18.4s, v13.4s\n"
+ ".inst 0x0ea168e7 // bfcvtn v7.4h, v7.4s\n"
+ "zip2 v13.4s, v18.4s, v13.4s\n"
+ "ldr q18, [x20], #0x10\n"
+ ".inst 0x4ea169a7 // bfcvtn2 v7.8h, v13.4s\n"
+ "ldr q13, [x20], #0x10\n"
+ "str q16, [x27, #0x0]\n"
+ "zip1 v16.4s, v14.4s, v8.4s\n"
+ "zip2 v8.4s, v14.4s, v8.4s\n"
+ "str q9, [x27, #0x10]\n"
+ "zip1 v9.4s, v21.4s, v29.4s\n"
+ "str q25, [x27, #0x20]\n"
+ "zip1 v25.4s, v16.4s, v9.4s\n"
+ "str q11, [x27, #0x30]\n"
+ ".inst 0x0ea16b2e // bfcvtn v14.4h, v25.4s\n"
+ "str q3, [x27, #0x40]\n"
+ "zip2 v25.4s, v16.4s, v9.4s\n"
+ "str q10, [x27, #0x50]\n"
+ "zip2 v29.4s, v21.4s, v29.4s\n"
+ "str q4, [x27, #0x60]\n"
+ ".inst 0x4ea16b2e // bfcvtn2 v14.8h, v25.4s\n"
+ "str q27, [x27, #0x70]\n"
+ "zip1 v27.4s, v8.4s, v29.4s\n"
+ "str q28, [x27, #0x80]\n"
+ ".inst 0x0ea16b7b // bfcvtn v27.4h, v27.4s\n"
+ "str q6, [x27, #0x90]\n"
+ "zip2 v16.4s, v8.4s, v29.4s\n"
+ "str q0, [x27, #0xa0]\n"
+ "zip1 v0.4s, v20.4s, v15.4s\n"
+ "str q19, [x27, #0xb0]\n"
+ ".inst 0x4ea16a1b // bfcvtn2 v27.8h, v16.4s\n"
+ "str q31, [x27, #0xc0]\n"
+ "zip1 v29.4s, v12.4s, v26.4s\n"
+ "str q30, [x27, #0xd0]\n"
+ "zip1 v28.4s, v0.4s, v29.4s\n"
+ "str q22, [x27, #0xe0]\n"
+ ".inst 0x0ea16b83 // bfcvtn v3.4h, v28.4s\n"
+ "str q7, [x27, #0xf0]\n"
+ "zip2 v22.4s, v0.4s, v29.4s\n"
+ "str q14, [x27, #0x100]\n"
+ "zip2 v19.4s, v20.4s, v15.4s\n"
+ "str q27, [x27, #0x110]\n"
+ ".inst 0x4ea16ac3 // bfcvtn2 v3.8h, v22.4s\n"
+ "str q3, [x27, #0x120]\n"
+ "zip2 v4.4s, v12.4s, v26.4s\n"
+ "zip1 v20.4s, v24.4s, v1.4s\n"
+ "zip1 v22.4s, v19.4s, v4.4s\n"
+ ".inst 0x0ea16ad9 // bfcvtn v25.4h, v22.4s\n"
+ "zip2 v6.4s, v19.4s, v4.4s\n"
+ "zip1 v22.4s, v17.4s, v18.4s\n"
+ ".inst 0x4ea168d9 // bfcvtn2 v25.8h, v6.4s\n"
+ "str q25, [x27, #0x130]\n"
+ "zip1 v3.4s, v20.4s, v22.4s\n"
+ "zip2 v22.4s, v20.4s, v22.4s\n"
+ ".inst 0x0ea16864 // bfcvtn v4.4h, v3.4s\n"
+ "zip2 v15.4s, v24.4s, v1.4s\n"
+ "zip2 v17.4s, v17.4s, v18.4s\n"
+ ".inst 0x4ea16ac4 // bfcvtn2 v4.8h, v22.4s\n"
+ "str q4, [x27, #0x140]\n"
+ "zip1 v16.4s, v15.4s, v17.4s\n"
+ "zip2 v8.4s, v15.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v23.4s, v2.4s\n"
+ "zip1 v17.4s, v5.4s, v13.4s\n"
+ ".inst 0x4ea16910 // bfcvtn2 v16.8h, v8.4s\n"
+ "str q16, [x27, #0x150]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v10.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v23.4s, v2.4s\n"
+ "zip2 v17.4s, v5.4s, v13.4s\n"
+ ".inst 0x4ea16950 // bfcvtn2 v16.8h, v10.4s\n"
+ "str q16, [x27, #0x160]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q16, [x27, #0x170]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q19, [x26], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v24.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v23.4s, v17.4s, v16.4s\n"
+ "ldr q22, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v21.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v14.4s, v18.4s, v16.4s\n"
+ "ldr q13, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v12.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v11.4s, v17.4s, v16.4s\n"
+ "ldr q10, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v9.4s, v18.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v8.4s, v18.4s, v17.4s\n"
+ "ldr q7, [x23], #0x10\n"
+ "ldr q6, [x22], #0x10\n"
+ "zip1 v17.4s, v19.4s, v16.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip2 v19.4s, v19.4s, v16.4s\n"
+ "ldr q5, [x23], #0x10\n"
+ "zip1 v16.4s, v24.4s, v17.4s\n"
+ "ldr q4, [x22], #0x10\n"
+ ".inst 0x0ea16a03 // bfcvtn v3.4h, v16.4s\n"
+ "ldr q2, [x21], #0x10\n"
+ "zip2 v17.4s, v24.4s, v17.4s\n"
+ "ldr q1, [x24], #0x10\n"
+ "zip1 v16.4s, v23.4s, v19.4s\n"
+ "ldr q0, [x23], #0x10\n"
+ ".inst 0x4ea16a23 // bfcvtn2 v3.8h, v17.4s\n"
+ "ldr q31, [x22], #0x10\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr q30, [x21], #0x10\n"
+ "zip2 v16.4s, v23.4s, v19.4s\n"
+ "ldr q29, [x24], #0x10\n"
+ "zip1 v17.4s, v22.4s, v20.4s\n"
+ "ldr q28, [x23], #0x10\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "ldr q27, [x22], #0x10\n"
+ "zip1 v16.4s, v21.4s, v17.4s\n"
+ "ldr q26, [x21], #0x10\n"
+ ".inst 0x0ea16a19 // bfcvtn v25.4h, v16.4s\n"
+ "ldr q24, [x20], #0x10\n"
+ "zip2 v16.4s, v21.4s, v17.4s\n"
+ "ldr q23, [x21], #0x10\n"
+ ".inst 0x4ea16a19 // bfcvtn2 v25.8h, v16.4s\n"
+ "zip2 v17.4s, v22.4s, v20.4s\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v16.4s, v14.4s, v17.4s\n"
+ "ldr q21, [x20], #0x10\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v16.4s, v14.4s, v17.4s\n"
+ "ldr q20, [x20], #0x10\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "zip1 v17.4s, v13.4s, v1.4s\n"
+ "str q3, [x27, #0x0]\n"
+ "zip1 v16.4s, v12.4s, v17.4s\n"
+ "str q18, [x27, #0x10]\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "str q25, [x27, #0x20]\n"
+ "zip2 v16.4s, v12.4s, v17.4s\n"
+ "str q19, [x27, #0x30]\n"
+ "zip2 v17.4s, v13.4s, v1.4s\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "str q18, [x27, #0x40]\n"
+ "zip1 v16.4s, v11.4s, v17.4s\n"
+ "zip2 v19.4s, v11.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip1 v17.4s, v10.4s, v29.4s\n"
+ "zip1 v16.4s, v9.4s, v17.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "str q18, [x27, #0x50]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v9.4s, v17.4s\n"
+ "zip2 v17.4s, v10.4s, v29.4s\n"
+ "zip1 v16.4s, v8.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x60]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v16.4s, v8.4s, v17.4s\n"
+ "zip1 v18.4s, v7.4s, v2.4s\n"
+ "zip1 v17.4s, v6.4s, v24.4s\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "str q19, [x27, #0x70]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v7.4s, v2.4s\n"
+ "zip2 v17.4s, v6.4s, v24.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xc0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v5.4s, v30.4s\n"
+ "zip1 v17.4s, v4.4s, v22.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xd0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v5.4s, v30.4s\n"
+ "zip2 v17.4s, v4.4s, v22.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xe0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v0.4s, v26.4s\n"
+ "zip1 v17.4s, v31.4s, v21.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xf0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v0.4s, v26.4s\n"
+ "zip2 v17.4s, v31.4s, v21.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x100]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v18.4s, v28.4s, v23.4s\n"
+ "zip1 v17.4s, v27.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x110]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v28.4s, v23.4s\n"
+ "zip2 v17.4s, v27.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0x120]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q16, [x27, #0x130]\n"
+ "add x27, x27, #0x80\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr q20, [x28], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x26], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v19.4s, v20.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v25.4s, v20.4s, v17.4s\n"
+ "ldr q24, [x23], #0x10\n"
+ "ldr q23, [x22], #0x10\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "ldr q22, [x21], #0x10\n"
+ "zip2 v21.4s, v18.4s, v16.4s\n"
+ "ldr q20, [x20], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v19.4s, v17.4s\n"
+ "zip1 v16.4s, v25.4s, v21.4s\n"
+ ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n"
+ "str q18, [x27, #0x0]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v16.4s, v25.4s, v21.4s\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v20.4s\n"
+ ".inst 0x4ea16a13 // bfcvtn2 v19.8h, v16.4s\n"
+ "str q19, [x27, #0x10]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v19.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v20.4s\n"
+ ".inst 0x4ea16a70 // bfcvtn2 v16.8h, v19.4s\n"
+ "str q16, [x27, #0xc0]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "zip2 v17.4s, v18.4s, v17.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ ".inst 0x4ea16a30 // bfcvtn2 v16.8h, v17.4s\n"
+ "str q16, [x27, #0xd0]\n"
+ "add x27, x27, #0x20\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr s18, [x28], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s17, [x26], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v18.4s, v18.4s, v16.4s\n"
+ "ldr s16, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s17, [x21], #0x4\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "ldr s18, [x20], #0x4\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "zip1 v17.4s, v20.4s, v17.4s\n"
+ "str d16, [x27, #0x0]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x27, #0xc0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x180\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x28, %x[in]\n"
+ "mov x27, %x[out]\n"
+ "add x26, x28, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q20, [x26], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v9.4s, v17.4s, v16.4s\n"
+ "ldr q8, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v7.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v6.4s, v18.4s, v16.4s\n"
+ "ldr q5, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v4.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v3.4s, v17.4s, v16.4s\n"
+ "ldr q2, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v1.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v0.4s, v18.4s, v16.4s\n"
+ "ldr q31, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v30.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v29.4s, v17.4s, v16.4s\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v27.4s, v18.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v26.4s, v18.4s, v17.4s\n"
+ "ldr q25, [x24], #0x10\n"
+ "zip1 v17.4s, v20.4s, v16.4s\n"
+ "zip2 v24.4s, v20.4s, v16.4s\n"
+ "ldr q23, [x24], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ "zip2 v17.4s, v19.4s, v17.4s\n"
+ "ldr q22, [x24], #0x10\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip1 v16.4s, v9.4s, v24.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ "zip2 v16.4s, v9.4s, v24.4s\n"
+ "zip1 v17.4s, v8.4s, v25.4s\n"
+ "str q19, [x27, #0x0]\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "str q18, [x27, #0x10]\n"
+ "zip1 v16.4s, v7.4s, v17.4s\n"
+ "zip2 v19.4s, v7.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v8.4s, v25.4s\n"
+ "zip1 v16.4s, v6.4s, v17.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "str q18, [x27, #0x20]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v6.4s, v17.4s\n"
+ "zip1 v17.4s, v5.4s, v23.4s\n"
+ "zip1 v16.4s, v4.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x30]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v4.4s, v17.4s\n"
+ "zip2 v17.4s, v5.4s, v23.4s\n"
+ "zip1 v16.4s, v3.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x40]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v3.4s, v17.4s\n"
+ "zip1 v17.4s, v2.4s, v22.4s\n"
+ "zip1 v16.4s, v1.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x50]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v1.4s, v17.4s\n"
+ "zip2 v17.4s, v2.4s, v22.4s\n"
+ "zip1 v16.4s, v0.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x60]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v0.4s, v17.4s\n"
+ "zip1 v17.4s, v31.4s, v21.4s\n"
+ "zip1 v16.4s, v30.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x70]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v30.4s, v17.4s\n"
+ "zip2 v17.4s, v31.4s, v21.4s\n"
+ "zip1 v16.4s, v29.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x80]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v29.4s, v17.4s\n"
+ "zip1 v17.4s, v28.4s, v20.4s\n"
+ "zip1 v16.4s, v27.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x90]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v17.4s, v27.4s, v17.4s\n"
+ "zip2 v18.4s, v28.4s, v20.4s\n"
+ "zip1 v16.4s, v26.4s, v18.4s\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ "str q19, [x27, #0xa0]\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v26.4s, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q17, [x27, #0xb0]\n"
+ "add x27, x27, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q17, [x28], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q20, [x26], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v19.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v1.4s, v17.4s, v16.4s\n"
+ "ldr q0, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v31.4s, v18.4s, v16.4s\n"
+ "ldr q17, [x28], #0x10\n"
+ "zip2 v30.4s, v18.4s, v16.4s\n"
+ "ldr q29, [x26], #0x10\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip1 v28.4s, v17.4s, v16.4s\n"
+ "ldr q18, [x28], #0x10\n"
+ "zip2 v27.4s, v17.4s, v16.4s\n"
+ "ldr q26, [x26], #0x10\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v25.4s, v18.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v24.4s, v18.4s, v17.4s\n"
+ "ldr q23, [x24], #0x10\n"
+ "zip1 v17.4s, v20.4s, v16.4s\n"
+ "zip2 v22.4s, v20.4s, v16.4s\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v16.4s, v19.4s, v17.4s\n"
+ "zip2 v19.4s, v19.4s, v17.4s\n"
+ "ldr q20, [x24], #0x10\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip1 v16.4s, v1.4s, v22.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v16.4s, v1.4s, v22.4s\n"
+ ".inst 0x4ea16a71 // bfcvtn2 v17.8h, v19.4s\n"
+ "str q17, [x27, #0x0]\n"
+ ".inst 0x4ea16a12 // bfcvtn2 v18.8h, v16.4s\n"
+ "zip1 v17.4s, v0.4s, v23.4s\n"
+ "str q18, [x27, #0x10]\n"
+ "zip1 v16.4s, v31.4s, v17.4s\n"
+ "zip2 v19.4s, v31.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v0.4s, v23.4s\n"
+ "zip1 v16.4s, v30.4s, v17.4s\n"
+ ".inst 0x4ea16a72 // bfcvtn2 v18.8h, v19.4s\n"
+ "str q18, [x27, #0x20]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v30.4s, v17.4s\n"
+ "zip1 v17.4s, v29.4s, v21.4s\n"
+ "zip1 v16.4s, v28.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x30]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v28.4s, v17.4s\n"
+ "zip2 v17.4s, v29.4s, v21.4s\n"
+ "zip1 v16.4s, v27.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x40]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v18.4s, v27.4s, v17.4s\n"
+ "zip1 v17.4s, v26.4s, v20.4s\n"
+ "zip1 v16.4s, v25.4s, v17.4s\n"
+ ".inst 0x4ea16a53 // bfcvtn2 v19.8h, v18.4s\n"
+ "str q19, [x27, #0x50]\n"
+ ".inst 0x0ea16a13 // bfcvtn v19.4h, v16.4s\n"
+ "zip2 v17.4s, v25.4s, v17.4s\n"
+ "zip2 v18.4s, v26.4s, v20.4s\n"
+ "zip1 v16.4s, v24.4s, v18.4s\n"
+ ".inst 0x4ea16a33 // bfcvtn2 v19.8h, v17.4s\n"
+ "str q19, [x27, #0x60]\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v24.4s, v18.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q17, [x27, #0x70]\n"
+ "add x27, x27, #0x80\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q19, [x28], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x26], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x25], #0x10\n"
+ "zip1 v21.4s, v19.4s, v17.4s\n"
+ "ldr q16, [x24], #0x10\n"
+ "zip2 v20.4s, v19.4s, v17.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "zip2 v19.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v21.4s, v17.4s\n"
+ ".inst 0x0ea16a12 // bfcvtn v18.4h, v16.4s\n"
+ "zip2 v17.4s, v21.4s, v17.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ ".inst 0x4ea16a32 // bfcvtn2 v18.8h, v17.4s\n"
+ "str q18, [x27, #0x0]\n"
+ ".inst 0x0ea16a11 // bfcvtn v17.4h, v16.4s\n"
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ ".inst 0x4ea16a11 // bfcvtn2 v17.8h, v16.4s\n"
+ "str q17, [x27, #0x10]\n"
+ "add x27, x27, #0x20\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr s17, [x28], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s18, [x26], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v17.4s, v17.4s, v16.4s\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v16.4s, v18.4s, v16.4s\n"
+ "zip1 v16.4s, v17.4s, v16.4s\n"
+ ".inst 0x0ea16a10 // bfcvtn v16.4h, v16.4s\n"
+ "str d16, [x27, #0x0]\n"
+ "add x27, x27, #0x8\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0xc0\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
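+// As with the 16-wide variant, hook the kernel into the Transform<>
+// interface: a 24-wide, 4-high FP32 -> BF16 interleave over the x/k ranges.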
+template<>
+void Transform<24, 4, true, VLType::None>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
new file mode 100644
index 0000000000..0a628d372e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_bf16fp32.hpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
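+// The reverse direction: widen BF16 to FP32 while transposing 12-element
+// tiles. A bfloat16 is the top 16 bits of the corresponding float, so the
+// widening is a plain left shift by 16 (shll / shll2) with no rounding.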
+void a64_transpose_interleave_24_bf16fp32(float *out, const bfloat16 *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(float);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q17, [x24], #0x10\n"
+ "shll v3.4s, v17.4h, #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "shll2 v23.4s, v17.8h, #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "cmp x19, #0x18\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "shll2 v2.4s, v16.8h, #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "shll v21.4s, v17.4h, #0x10\n"
+ "ldr q18, [x21], #0x10\n"
+ "shll2 v1.4s, v17.8h, #0x10\n"
+ "ldr q0, [x20], #0x10\n"
+ "shll v31.4s, v19.4h, #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "shll2 v30.4s, v19.8h, #0x10\n"
+ "shll v29.4s, v16.4h, #0x10\n"
+ "ldr q20, [x21], #0x10\n"
+ "shll2 v28.4s, v16.8h, #0x10\n"
+ "ldr q27, [x20], #0x10\n"
+ "shll v19.4s, v18.4h, #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "shll v26.4s, v17.4h, #0x10\n"
+ "shll2 v25.4s, v17.8h, #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "shll2 v18.4s, v18.8h, #0x10\n"
+ "str q3, [x23, #0x0]\n"
+ "shll v17.4s, v20.4h, #0x10\n"
+ "str q23, [x23, #0x10]\n"
+ "shll2 v23.4s, v20.8h, #0x10\n"
+ "str q22, [x23, #0x20]\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "str q21, [x23, #0x30]\n"
+ "shll2 v21.4s, v16.8h, #0x10\n"
+ "str q1, [x23, #0x40]\n"
+ "shll v16.4s, v0.4h, #0x10\n"
+ "str q29, [x23, #0x50]\n"
+ "shll2 v20.4s, v0.8h, #0x10\n"
+ "str q19, [x23, #0x60]\n"
+ "shll v19.4s, v27.4h, #0x10\n"
+ "str q18, [x23, #0x70]\n"
+ "shll2 v18.4s, v27.8h, #0x10\n"
+ "str q17, [x23, #0x80]\n"
+ "shll v17.4s, v24.4h, #0x10\n"
+ "str q16, [x23, #0x90]\n"
+ "shll2 v16.4s, v24.8h, #0x10\n"
+ "str q20, [x23, #0xa0]\n"
+ "str q19, [x23, #0xb0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "str q2, [x23, #0x0]\n"
+ "str q31, [x23, #0x10]\n"
+ "str q30, [x23, #0x20]\n"
+ "str q28, [x23, #0x30]\n"
+ "str q26, [x23, #0x40]\n"
+ "str q25, [x23, #0x50]\n"
+ "str q23, [x23, #0x60]\n"
+ "str q22, [x23, #0x70]\n"
+ "str q21, [x23, #0x80]\n"
+ "str q18, [x23, #0x90]\n"
+ "str q17, [x23, #0xa0]\n"
+ "str q16, [x23, #0xb0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q17, [x24], #0x10\n"
+ "shll v19.4s, v17.4h, #0x10\n"
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "shll2 v27.4s, v17.8h, #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "cmp x19, #0xc\n"
+ "shll v26.4s, v16.4h, #0x10\n"
+ "ldr q16, [x21], #0x10\n"
+ "ldr q25, [x20], #0x10\n"
+ "shll v24.4s, v17.4h, #0x10\n"
+ "shll2 v23.4s, v17.8h, #0x10\n"
+ "ldr d18, [x22], #0x8\n"
+ "shll v22.4s, v16.4h, #0x10\n"
+ "ldr d17, [x21], #0x8\n"
+ "shll2 v21.4s, v16.8h, #0x10\n"
+ "ldr d16, [x20], #0x8\n"
+ "shll v20.4s, v25.4h, #0x10\n"
+ "str q19, [x23, #0x0]\n"
+ "shll v19.4s, v18.4h, #0x10\n"
+ "str q27, [x23, #0x10]\n"
+ "shll2 v18.4s, v25.8h, #0x10\n"
+ "str q26, [x23, #0x20]\n"
+ "shll v17.4s, v17.4h, #0x10\n"
+ "str q24, [x23, #0x30]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str q23, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q22, [x23, #0x60]\n"
+ "str q21, [x23, #0x70]\n"
+ "str q17, [x23, #0x80]\n"
+ "str q20, [x23, #0x90]\n"
+ "str q18, [x23, #0xa0]\n"
+ "str q16, [x23, #0xb0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d16, [x24], #0x8\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "ldr d16, [x22], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "shll v18.4s, v16.4h, #0x10\n"
+ "ldr d16, [x21], #0x8\n"
+ "cmp x19, #0x4\n"
+ "shll v17.4s, v16.4h, #0x10\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q19, [x23, #0x0]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str q18, [x23, #0x30]\n"
+ "str q17, [x23, #0x60]\n"
+ "str q16, [x23, #0x90]\n"
+ "add x23, x23, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h16, [x24], #0x2\n"
+ "shll v19.4s, v16.4h, #0x10\n"
+ "ldr h16, [x22], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "shll v18.4s, v16.4h, #0x10\n"
+ "ldr h16, [x21], #0x2\n"
+ "cmp x19, #0x1\n"
+ "shll v17.4s, v16.4h, #0x10\n"
+ "ldr h16, [x20], #0x2\n"
+ "str s19, [x23, #0x0]\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str s18, [x23, #0x30]\n"
+ "str s17, [x23, #0x60]\n"
+ "str s16, [x23, #0x90]\n"
+ "add x23, x23, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0xc0\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q16, [x24], #0x10\n"
+ "shll v20.4s, v16.4h, #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "shll2 v17.4s, v16.8h, #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "shll v16.4s, v18.4h, #0x10\n"
+ "cmp x19, #0x18\n"
+ "shll2 v18.4s, v18.8h, #0x10\n"
+ "str q20, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "shll v17.4s, v19.4h, #0x10\n"
+ "str q16, [x23, #0x20]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "shll2 v16.4s, v19.8h, #0x10\n"
+ "str q18, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x24], #0x10\n"
+ "shll v18.4s, v17.4h, #0x10\n"
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "shll2 v17.4s, v17.8h, #0x10\n"
+ "str q18, [x23, #0x0]\n"
+ "cmp x19, #0xc\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str q17, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x24], #0x8\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str q16, [x23, #0x0]\n"
+ "sub x19, x19, #0x4\n"
+ "add x23, x23, #0x10\n"
+ "cmp x19, #0x4\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x24], #0x2\n"
+ "shll v16.4s, v16.4h, #0x10\n"
+ "str s16, [x23, #0x0]\n"
+ "sub x19, x19, #0x1\n"
+ "add x23, x23, #0x4\n"
+ "cmp x19, #0x1\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x30\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
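+// 12-wide, single-row-at-a-time BF16 -> FP32 transform over the x/k ranges.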
+template<>
+void Transform<12, 1, true, VLType::None>(
+ float *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24_bf16fp32(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
new file mode 100644
index 0000000000..7bac8173e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24_fp16fp32.hpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
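+// Same 12-element tile layout as the BF16 variant above, but the input is
+// IEEE FP16, so widening needs the real fcvtl / fcvtl2 conversion
+// instructions instead of a shift.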
+void a64_transpose_interleave_24_fp16fp32(float *out, const __fp16 *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 12 * height * sizeof(float);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q17, [x24], #0x10\n"
+ "fcvtl v3.4s, v17.4h\n"
+ "ldr q16, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "fcvtl2 v23.4s, v17.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "cmp x19, #0x18\n"
+ "fcvtl v22.4s, v16.4h\n"
+ "ldr q19, [x24], #0x10\n"
+ "fcvtl2 v2.4s, v16.8h\n"
+ "ldr q16, [x22], #0x10\n"
+ "fcvtl v21.4s, v17.4h\n"
+ "ldr q18, [x21], #0x10\n"
+ "fcvtl2 v1.4s, v17.8h\n"
+ "ldr q0, [x20], #0x10\n"
+ "fcvtl v31.4s, v19.4h\n"
+ "ldr q17, [x22], #0x10\n"
+ "fcvtl2 v30.4s, v19.8h\n"
+ "fcvtl v29.4s, v16.4h\n"
+ "ldr q20, [x21], #0x10\n"
+ "fcvtl2 v28.4s, v16.8h\n"
+ "ldr q27, [x20], #0x10\n"
+ "fcvtl v19.4s, v18.4h\n"
+ "ldr q16, [x21], #0x10\n"
+ "fcvtl v26.4s, v17.4h\n"
+ "fcvtl2 v25.4s, v17.8h\n"
+ "ldr q24, [x20], #0x10\n"
+ "fcvtl2 v18.4s, v18.8h\n"
+ "str q3, [x23, #0x0]\n"
+ "fcvtl v17.4s, v20.4h\n"
+ "str q23, [x23, #0x10]\n"
+ "fcvtl2 v23.4s, v20.8h\n"
+ "str q22, [x23, #0x20]\n"
+ "fcvtl v22.4s, v16.4h\n"
+ "str q21, [x23, #0x30]\n"
+ "fcvtl2 v21.4s, v16.8h\n"
+ "str q1, [x23, #0x40]\n"
+ "fcvtl v16.4s, v0.4h\n"
+ "str q29, [x23, #0x50]\n"
+ "fcvtl2 v20.4s, v0.8h\n"
+ "str q19, [x23, #0x60]\n"
+ "fcvtl v19.4s, v27.4h\n"
+ "str q18, [x23, #0x70]\n"
+ "fcvtl2 v18.4s, v27.8h\n"
+ "str q17, [x23, #0x80]\n"
+ "fcvtl v17.4s, v24.4h\n"
+ "str q16, [x23, #0x90]\n"
+ "fcvtl2 v16.4s, v24.8h\n"
+ "str q20, [x23, #0xa0]\n"
+ "str q19, [x23, #0xb0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "str q2, [x23, #0x0]\n"
+ "str q31, [x23, #0x10]\n"
+ "str q30, [x23, #0x20]\n"
+ "str q28, [x23, #0x30]\n"
+ "str q26, [x23, #0x40]\n"
+ "str q25, [x23, #0x50]\n"
+ "str q23, [x23, #0x60]\n"
+ "str q22, [x23, #0x70]\n"
+ "str q21, [x23, #0x80]\n"
+ "str q18, [x23, #0x90]\n"
+ "str q17, [x23, #0xa0]\n"
+ "str q16, [x23, #0xb0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q17, [x24], #0x10\n"
+ "fcvtl v19.4s, v17.4h\n"
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "fcvtl2 v27.4s, v17.8h\n"
+ "ldr q17, [x22], #0x10\n"
+ "cmp x19, #0xc\n"
+ "fcvtl v26.4s, v16.4h\n"
+ "ldr q16, [x21], #0x10\n"
+ "ldr q25, [x20], #0x10\n"
+ "fcvtl v24.4s, v17.4h\n"
+ "fcvtl2 v23.4s, v17.8h\n"
+ "ldr d18, [x22], #0x8\n"
+ "fcvtl v22.4s, v16.4h\n"
+ "ldr d17, [x21], #0x8\n"
+ "fcvtl2 v21.4s, v16.8h\n"
+ "ldr d16, [x20], #0x8\n"
+ "fcvtl v20.4s, v25.4h\n"
+ "str q19, [x23, #0x0]\n"
+ "fcvtl v19.4s, v18.4h\n"
+ "str q27, [x23, #0x10]\n"
+ "fcvtl2 v18.4s, v25.8h\n"
+ "str q26, [x23, #0x20]\n"
+ "fcvtl v17.4s, v17.4h\n"
+ "str q24, [x23, #0x30]\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str q23, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q22, [x23, #0x60]\n"
+ "str q21, [x23, #0x70]\n"
+ "str q17, [x23, #0x80]\n"
+ "str q20, [x23, #0x90]\n"
+ "str q18, [x23, #0xa0]\n"
+ "str q16, [x23, #0xb0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d16, [x24], #0x8\n"
+ "fcvtl v19.4s, v16.4h\n"
+ "ldr d16, [x22], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "fcvtl v18.4s, v16.4h\n"
+ "ldr d16, [x21], #0x8\n"
+ "cmp x19, #0x4\n"
+ "fcvtl v17.4s, v16.4h\n"
+ "ldr d16, [x20], #0x8\n"
+ "str q19, [x23, #0x0]\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str q18, [x23, #0x30]\n"
+ "str q17, [x23, #0x60]\n"
+ "str q16, [x23, #0x90]\n"
+ "add x23, x23, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h16, [x24], #0x2\n"
+ "fcvtl v19.4s, v16.4h\n"
+ "ldr h16, [x22], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "fcvtl v18.4s, v16.4h\n"
+ "ldr h16, [x21], #0x2\n"
+ "cmp x19, #0x1\n"
+ "fcvtl v17.4s, v16.4h\n"
+ "ldr h16, [x20], #0x2\n"
+ "str s19, [x23, #0x0]\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str s18, [x23, #0x30]\n"
+ "str s17, [x23, #0x60]\n"
+ "str s16, [x23, #0x90]\n"
+ "add x23, x23, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0xc0\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Unroll column loop
+ "ldr q16, [x24], #0x10\n"
+ "fcvtl v20.4s, v16.4h\n"
+ "ldr q18, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "fcvtl2 v17.4s, v16.8h\n"
+ "ldr q19, [x24], #0x10\n"
+ "fcvtl v16.4s, v18.4h\n"
+ "cmp x19, #0x18\n"
+ "fcvtl2 v18.4s, v18.8h\n"
+ "str q20, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "fcvtl v17.4s, v19.4h\n"
+ "str q16, [x23, #0x20]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "fcvtl2 v16.4s, v19.8h\n"
+ "str q18, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0xc\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Column loop
+ "ldr q17, [x24], #0x10\n"
+ "fcvtl v18.4s, v17.4h\n"
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0xc\n"
+ "fcvtl2 v17.4s, v17.8h\n"
+ "str q18, [x23, #0x0]\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str q17, [x23, #0x10]\n"
+ "cmp x19, #0xc\n"
+ "str q16, [x23, #0x20]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Column loop skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x24], #0x8\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str q16, [x23, #0x0]\n"
+ "sub x19, x19, #0x4\n"
+ "add x23, x23, #0x10\n"
+ "cmp x19, #0x4\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x24], #0x2\n"
+ "fcvtl v16.4s, v16.4h\n"
+ "str s16, [x23, #0x0]\n"
+ "sub x19, x19, #0x1\n"
+ "add x23, x23, #0x4\n"
+ "cmp x19, #0x1\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x30\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 1, true, VLType::None>(
+ float *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_24_fp16fp32(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
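+
+// For reference, the specialisation above is roughly equivalent to the
+// scalar loop below (illustrative only; the kernel produces the same layout
+// using NEON loads and FCVTL/FCVTL2 widening conversions):
+//
+//   const int h = kmax - k0;
+//   for (int k = k0; k < kmax; k++) {
+//     for (int x = x0; x < xmax; x++) {
+//       int xr = x - x0;
+//       out[(xr / 12) * 12 * h + (k - k0) * 12 + (xr % 12)] =
+//           static_cast<float>(in[k * stride + x]);
+//     }
+//   }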
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
deleted file mode 100644
index bcbe2b84d8..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 12x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<12, 1, true, 4, 4, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 24 x uint16_t specialisation
- TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 24x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 24 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR q2, [%[in0]], #16\n"
- "STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR q2, [%[in0]], #16\n"
- "LDP q3, q4, [%[in1]], #32\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR q5, [%[in1]], #16\n"
- "STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- "LDR q2, [%[in0]], #16\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDP q3, q4, [%[in1]], #32\n"
- "STP q2, q3, [%[out], #32]\n"
- "LDR q5, [%[in1]], #16\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STP q4, q5, [%[out], #64]\n"
- "LDP q6, q7, [%[in2]], #32\n"
- "STP q6, q7, [%[out], #96]\n"
- "LDR q8, [%[in2]], #16\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDP q9, q10, [%[in3]], #32\n"
- "STP q8, q9, [%[out], #128]\n"
- "LDR q11, [%[in3]], #16\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
-
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __arch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
new file mode 100644
index 0000000000..912d512643
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_1x4.hpp
@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
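+// Interleaves groups of four input rows byte-wise (ZIP1/ZIP2) into 32-column
+// output blocks. The main loop processes sixteen rows per pass; the tail
+// loop handles a final group of up to four rows, substituting the zeroed
+// pad_row for rows past the input height.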
+void a64_transpose_interleave_32_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 32 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x10\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x16, %x[in]\n"
+ "mov x15, %x[out]\n"
+ "add x14, x16, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x10\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q14, [x16], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "ldr q7, [x14], #0x10\n"
+ "cmp x19, #0x20\n"
+ "ldr q1, [x13], #0x10\n"
+ "zip1 v0.16b, v14.16b, v1.16b\n"
+ "ldr q2, [x16], #0x10\n"
+ "zip2 v1.16b, v14.16b, v1.16b\n"
+ "ldr q26, [x14], #0x10\n"
+ "ldr q19, [x13], #0x10\n"
+ "zip1 v20.16b, v2.16b, v19.16b\n"
+ "ldr q15, [x12], #0x10\n"
+ "zip2 v14.16b, v2.16b, v19.16b\n"
+ "ldr q30, [x11], #0x10\n"
+ "ldr q29, [x10], #0x10\n"
+ "zip1 v8.16b, v7.16b, v15.16b\n"
+ "ldr q31, [x12], #0x10\n"
+ "zip2 v22.16b, v7.16b, v15.16b\n"
+ "ldr q28, [x11], #0x10\n"
+ "zip1 v25.16b, v0.16b, v8.16b\n"
+ "ldr q23, [x10], #0x10\n"
+ "zip2 v10.16b, v0.16b, v8.16b\n"
+ "ldr q27, [x9], #0x10\n"
+ "zip1 v4.16b, v1.16b, v22.16b\n"
+ "ldr q0, [x28], #0x10\n"
+ "zip2 v5.16b, v1.16b, v22.16b\n"
+ "ldr q13, [x27], #0x10\n"
+ "zip1 v12.16b, v26.16b, v31.16b\n"
+ "ldr q17, [x26], #0x10\n"
+ "zip1 v24.16b, v20.16b, v12.16b\n"
+ "ldr q18, [x9], #0x10\n"
+ "zip2 v12.16b, v20.16b, v12.16b\n"
+ "ldr q6, [x28], #0x10\n"
+ "zip2 v16.16b, v26.16b, v31.16b\n"
+ "ldr q15, [x27], #0x10\n"
+ "zip1 v22.16b, v14.16b, v16.16b\n"
+ "ldr q1, [x26], #0x10\n"
+ "zip2 v9.16b, v14.16b, v16.16b\n"
+ "ldr q8, [x25], #0x10\n"
+ "zip1 v26.16b, v30.16b, v27.16b\n"
+ "ldr q19, [x24], #0x10\n"
+ "zip1 v16.16b, v29.16b, v0.16b\n"
+ "ldr q7, [x23], #0x10\n"
+ "zip1 v11.16b, v26.16b, v16.16b\n"
+ "ldr q14, [x22], #0x10\n"
+ "zip2 v20.16b, v26.16b, v16.16b\n"
+ "ldr q16, [x25], #0x10\n"
+ "zip2 v26.16b, v30.16b, v27.16b\n"
+ "ldr q31, [x24], #0x10\n"
+ "zip2 v21.16b, v29.16b, v0.16b\n"
+ "ldr q0, [x23], #0x10\n"
+ "zip1 v30.16b, v26.16b, v21.16b\n"
+ "ldr q29, [x22], #0x10\n"
+ "zip2 v27.16b, v26.16b, v21.16b\n"
+ "ldr q3, [x21], #0x10\n"
+ "zip1 v21.16b, v28.16b, v18.16b\n"
+ "ldr q2, [x20], #0x10\n"
+ "zip1 v26.16b, v23.16b, v6.16b\n"
+ "zip2 v18.16b, v28.16b, v18.16b\n"
+ "ldr q28, [x21], #0x10\n"
+ "zip2 v23.16b, v23.16b, v6.16b\n"
+ "zip1 v6.16b, v21.16b, v26.16b\n"
+ "zip2 v21.16b, v21.16b, v26.16b\n"
+ "ldr q26, [x20], #0x10\n"
+ "str q25, [x15, #0x0]\n"
+ "zip1 v25.16b, v18.16b, v23.16b\n"
+ "zip2 v23.16b, v18.16b, v23.16b\n"
+ "str q10, [x15, #0x10]\n"
+ "zip1 v18.16b, v13.16b, v8.16b\n"
+ "str q4, [x15, #0x20]\n"
+ "zip1 v10.16b, v17.16b, v19.16b\n"
+ "str q5, [x15, #0x30]\n"
+ "zip1 v5.16b, v18.16b, v10.16b\n"
+ "str q24, [x15, #0x40]\n"
+ "zip2 v24.16b, v18.16b, v10.16b\n"
+ "str q12, [x15, #0x50]\n"
+ "zip2 v18.16b, v13.16b, v8.16b\n"
+ "str q22, [x15, #0x60]\n"
+ "zip2 v17.16b, v17.16b, v19.16b\n"
+ "str q9, [x15, #0x70]\n"
+ "zip1 v9.16b, v18.16b, v17.16b\n"
+ "str q11, [x15, #0x80]\n"
+ "zip2 v12.16b, v18.16b, v17.16b\n"
+ "str q20, [x15, #0x90]\n"
+ "zip1 v20.16b, v15.16b, v16.16b\n"
+ "str q30, [x15, #0xa0]\n"
+ "zip1 v17.16b, v1.16b, v31.16b\n"
+ "str q27, [x15, #0xb0]\n"
+ "zip1 v19.16b, v20.16b, v17.16b\n"
+ "str q6, [x15, #0xc0]\n"
+ "zip2 v18.16b, v20.16b, v17.16b\n"
+ "str q21, [x15, #0xd0]\n"
+ "zip2 v17.16b, v15.16b, v16.16b\n"
+ "str q25, [x15, #0xe0]\n"
+ "zip2 v16.16b, v1.16b, v31.16b\n"
+ "str q23, [x15, #0xf0]\n"
+ "zip1 v22.16b, v17.16b, v16.16b\n"
+ "str q5, [x15, #0x100]\n"
+ "zip2 v21.16b, v17.16b, v16.16b\n"
+ "str q24, [x15, #0x110]\n"
+ "zip1 v17.16b, v7.16b, v3.16b\n"
+ "str q9, [x15, #0x120]\n"
+ "zip1 v16.16b, v14.16b, v2.16b\n"
+ "str q12, [x15, #0x130]\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "str q19, [x15, #0x140]\n"
+ "zip2 v19.16b, v17.16b, v16.16b\n"
+ "str q18, [x15, #0x150]\n"
+ "zip2 v18.16b, v7.16b, v3.16b\n"
+ "str q22, [x15, #0x160]\n"
+ "zip2 v16.16b, v14.16b, v2.16b\n"
+ "str q21, [x15, #0x170]\n"
+ "zip1 v17.16b, v18.16b, v16.16b\n"
+ "str q20, [x15, #0x180]\n"
+ "zip2 v16.16b, v18.16b, v16.16b\n"
+ "str q19, [x15, #0x190]\n"
+ "zip1 v18.16b, v0.16b, v28.16b\n"
+ "str q17, [x15, #0x1a0]\n"
+ "zip1 v17.16b, v29.16b, v26.16b\n"
+ "str q16, [x15, #0x1b0]\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x1c0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x1d0]\n"
+ "zip2 v18.16b, v0.16b, v28.16b\n"
+ "zip2 v17.16b, v29.16b, v26.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x1e0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x1f0]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q18, [x16], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q20, [x14], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q17, [x13], #0x10\n"
+ "zip1 v19.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x12], #0x10\n"
+ "zip2 v18.16b, v18.16b, v17.16b\n"
+ "ldr q3, [x11], #0x10\n"
+ "ldr q2, [x10], #0x10\n"
+ "zip1 v17.16b, v20.16b, v16.16b\n"
+ "ldr q1, [x9], #0x10\n"
+ "zip2 v16.16b, v20.16b, v16.16b\n"
+ "ldr q0, [x28], #0x10\n"
+ "zip1 v31.16b, v19.16b, v17.16b\n"
+ "ldr q30, [x27], #0x10\n"
+ "zip2 v20.16b, v19.16b, v17.16b\n"
+ "ldr q29, [x26], #0x10\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr q28, [x25], #0x10\n"
+ "zip2 v18.16b, v18.16b, v16.16b\n"
+ "ldr q27, [x24], #0x10\n"
+ "zip1 v17.16b, v3.16b, v1.16b\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v16.16b, v2.16b, v0.16b\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip1 v24.16b, v17.16b, v16.16b\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip2 v22.16b, v17.16b, v16.16b\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip2 v17.16b, v3.16b, v1.16b\n"
+ "str q31, [x15, #0x0]\n"
+ "zip2 v16.16b, v2.16b, v0.16b\n"
+ "str q20, [x15, #0x10]\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "str q19, [x15, #0x20]\n"
+ "zip2 v19.16b, v17.16b, v16.16b\n"
+ "str q18, [x15, #0x30]\n"
+ "zip1 v18.16b, v30.16b, v28.16b\n"
+ "str q24, [x15, #0x80]\n"
+ "zip1 v16.16b, v29.16b, v27.16b\n"
+ "str q22, [x15, #0x90]\n"
+ "zip1 v17.16b, v18.16b, v16.16b\n"
+ "str q20, [x15, #0xa0]\n"
+ "zip2 v16.16b, v18.16b, v16.16b\n"
+ "str q19, [x15, #0xb0]\n"
+ "zip2 v18.16b, v30.16b, v28.16b\n"
+ "str q17, [x15, #0x100]\n"
+ "zip2 v17.16b, v29.16b, v27.16b\n"
+ "str q16, [x15, #0x110]\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x120]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x130]\n"
+ "zip1 v18.16b, v26.16b, v23.16b\n"
+ "zip1 v17.16b, v25.16b, v21.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x180]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x190]\n"
+ "zip2 v18.16b, v26.16b, v23.16b\n"
+ "zip2 v17.16b, v25.16b, v21.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x1a0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x1b0]\n"
+ "add x15, x15, #0x40\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr s18, [x16], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s17, [x14], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s16, [x13], #0x4\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x12], #0x4\n"
+ "ldr s18, [x11], #0x4\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "ldr s20, [x10], #0x4\n"
+ "ldr s17, [x9], #0x4\n"
+ "zip1 v23.16b, v19.16b, v16.16b\n"
+ "ldr s16, [x28], #0x4\n"
+ "zip1 v19.16b, v18.16b, v17.16b\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x26], #0x4\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ldr s17, [x25], #0x4\n"
+ "zip1 v21.16b, v19.16b, v16.16b\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "ldr s17, [x21], #0x4\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "str q23, [x15, #0x0]\n"
+ "str q21, [x15, #0x80]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q18, [x15, #0x100]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x15, #0x180]\n"
+ "add x15, x15, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr b18, [x16], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b17, [x14], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x13], #0x1\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr b16, [x12], #0x1\n"
+ "ldr b18, [x11], #0x1\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "ldr b20, [x10], #0x1\n"
+ "ldr b17, [x9], #0x1\n"
+ "zip1 v23.16b, v19.16b, v16.16b\n"
+ "ldr b16, [x28], #0x1\n"
+ "zip1 v19.16b, v18.16b, v17.16b\n"
+ "ldr b18, [x27], #0x1\n"
+ "ldr b22, [x26], #0x1\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ldr b17, [x25], #0x1\n"
+ "zip1 v21.16b, v19.16b, v16.16b\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b19, [x22], #0x1\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "ldr b17, [x21], #0x1\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "str s23, [x15, #0x0]\n"
+ "str s21, [x15, #0x80]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s18, [x15, #0x100]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x15, #0x180]\n"
+ "add x15, x15, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x200\n"
+ "cmp %x[height], #0x10\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x16, %x[in]\n"
+ "mov x15, %x[out]\n"
+ "add x14, x16, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add %x[in], x12, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x12, x12, %x[pad_row], GT\n"
+ "csel x13, x13, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q17, [x16], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "ldr q25, [x14], #0x10\n"
+ "cmp x19, #0x20\n"
+ "ldr q16, [x13], #0x10\n"
+ "zip1 v24.16b, v17.16b, v16.16b\n"
+ "ldr q18, [x16], #0x10\n"
+ "zip2 v23.16b, v17.16b, v16.16b\n"
+ "ldr q22, [x14], #0x10\n"
+ "ldr q17, [x13], #0x10\n"
+ "zip1 v21.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x12], #0x10\n"
+ "zip2 v20.16b, v18.16b, v17.16b\n"
+ "ldr q19, [x12], #0x10\n"
+ "zip1 v18.16b, v25.16b, v16.16b\n"
+ "zip2 v17.16b, v25.16b, v16.16b\n"
+ "zip1 v16.16b, v24.16b, v18.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v24.16b, v18.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip1 v16.16b, v23.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v16.16b, v23.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "zip1 v17.16b, v22.16b, v19.16b\n"
+ "zip1 v16.16b, v21.16b, v17.16b\n"
+ "str q16, [x15, #0x40]\n"
+ "zip2 v16.16b, v21.16b, v17.16b\n"
+ "str q16, [x15, #0x50]\n"
+ "zip2 v17.16b, v22.16b, v19.16b\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x15, #0x60]\n"
+ "zip2 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x15, #0x70]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q18, [x16], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q21, [x14], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q17, [x13], #0x10\n"
+ "zip1 v20.16b, v18.16b, v17.16b\n"
+ "ldr q16, [x12], #0x10\n"
+ "zip2 v19.16b, v18.16b, v17.16b\n"
+ "zip1 v18.16b, v21.16b, v16.16b\n"
+ "zip2 v17.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v18.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v20.16b, v18.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v16.16b, v19.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr s17, [x16], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s18, [x14], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s16, [x13], #0x4\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr s16, [x12], #0x4\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "add x15, x15, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr b17, [x16], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b18, [x14], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x13], #0x1\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x12], #0x1\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x15, #0x0]\n"
+ "add x15, x15, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x80\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<32, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_32_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<32, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_32_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
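+
+// For reference, both specialisations above are roughly equivalent to the
+// scalar loop below (illustrative only; the height is padded to a multiple
+// of 4 with zeros, matching the kernel's pad_row handling):
+//
+//   const int h4 = roundup<int>(kmax - k0, 4);
+//   for (int x = x0; x < xmax; x++) {
+//     for (int k = 0; k < h4; k++) {
+//       int xr = x - x0;
+//       out[(xr / 32) * 32 * h4 + (k / 4) * 32 * 4 + (xr % 32) * 4 + (k % 4)] =
+//           (k0 + k < kmax) ? in[(k0 + k) * stride + x] : 0;
+//     }
+//   }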
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
new file mode 100644
index 0000000000..05e68daba1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_32_2x2.hpp
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
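+// Interleaves pairs of input rows at 16-bit granularity (ZIP1/ZIP2 on .8h
+// lanes) into 32-column output blocks. The main loop processes four rows
+// per pass; the tail loop handles two, substituting the zeroed pad_row when
+// the height is odd. The bfloat16 specialisation below simply reinterprets
+// its elements as uint16_t, so no conversion is needed.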
+void a64_transpose_interleave_32_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 32 * roundup<size_t>(height, 2) * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 12f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[width]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x19, x21, %x[in_stride]\n"
+ "cmp x23, #0x40\n"
+ "add %x[in], x19, %x[in_stride]\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q14, [x24], #0x10\n"
+ "ldr q10, [x22], #0x10\n"
+ "sub x23, x23, #0x40\n"
+ "zip1 v12.8h, v14.8h, v10.8h\n"
+ "ldr q5, [x21], #0x10\n"
+ "ldr q3, [x19], #0x10\n"
+ "zip2 v31.8h, v14.8h, v10.8h\n"
+ "zip1 v19.8h, v5.8h, v3.8h\n"
+ "ldr q27, [x24], #0x10\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip1 v11.8h, v27.8h, v25.8h\n"
+ "zip2 v24.8h, v27.8h, v25.8h\n"
+ "ldr q6, [x21], #0x10\n"
+ "ldr q29, [x19], #0x10\n"
+ "zip2 v15.8h, v5.8h, v3.8h\n"
+ "zip1 v18.8h, v6.8h, v29.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q9, [x22], #0x10\n"
+ "zip1 v0.8h, v17.8h, v9.8h\n"
+ "zip2 v9.8h, v17.8h, v9.8h\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x19], #0x10\n"
+ "zip2 v8.8h, v6.8h, v29.8h\n"
+ "zip1 v30.8h, v21.8h, v20.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q5, [x22], #0x10\n"
+ "zip1 v13.8h, v17.8h, v5.8h\n"
+ "zip2 v25.8h, v17.8h, v5.8h\n"
+ "ldr q7, [x21], #0x10\n"
+ "ldr q29, [x19], #0x10\n"
+ "zip2 v27.8h, v21.8h, v20.8h\n"
+ "zip1 v14.8h, v7.8h, v29.8h\n"
+ "ldr q28, [x24], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip2 v1.8h, v7.8h, v29.8h\n"
+ "cmp x23, #0x40\n"
+ "ldr q10, [x21], #0x10\n"
+ "ldr q21, [x19], #0x10\n"
+ "zip1 v16.8h, v28.8h, v17.8h\n"
+ "zip2 v17.8h, v28.8h, v17.8h\n"
+ "ldr q5, [x24], #0x10\n"
+ "ldr q20, [x22], #0x10\n"
+ "zip1 v3.8h, v5.8h, v20.8h\n"
+ "zip2 v7.8h, v5.8h, v20.8h\n"
+ "ldr q22, [x21], #0x10\n"
+ "ldr q29, [x19], #0x10\n"
+ "zip1 v2.8h, v10.8h, v21.8h\n"
+ "zip2 v5.8h, v10.8h, v21.8h\n"
+ "ldr q21, [x24], #0x10\n"
+ "ldr q20, [x22], #0x10\n"
+ "zip1 v4.8h, v21.8h, v20.8h\n"
+ "zip2 v28.8h, v21.8h, v20.8h\n"
+ "ldr q6, [x21], #0x10\n"
+ "ldr q10, [x19], #0x10\n"
+ "zip1 v26.8h, v22.8h, v29.8h\n"
+ "zip2 v20.8h, v22.8h, v29.8h\n"
+ "ldr q29, [x24], #0x10\n"
+ "ldr q23, [x22], #0x10\n"
+ "zip1 v21.8h, v29.8h, v23.8h\n"
+ "zip2 v23.8h, v29.8h, v23.8h\n"
+ "ldr q22, [x21], #0x10\n"
+ "ldr q29, [x19], #0x10\n"
+ "str q12, [x20, #0x0]\n"
+ "zip1 v12.8h, v6.8h, v10.8h\n"
+ "str q31, [x20, #0x10]\n"
+ "zip2 v6.8h, v6.8h, v10.8h\n"
+ "zip1 v31.8h, v22.8h, v29.8h\n"
+ "str q11, [x20, #0x20]\n"
+ "zip2 v11.8h, v22.8h, v29.8h\n"
+ "str q24, [x20, #0x30]\n"
+ "str q0, [x20, #0x40]\n"
+ "str q9, [x20, #0x50]\n"
+ "str q13, [x20, #0x60]\n"
+ "str q25, [x20, #0x70]\n"
+ "str q19, [x20, #0x80]\n"
+ "str q15, [x20, #0x90]\n"
+ "str q18, [x20, #0xa0]\n"
+ "str q8, [x20, #0xb0]\n"
+ "str q30, [x20, #0xc0]\n"
+ "str q27, [x20, #0xd0]\n"
+ "str q14, [x20, #0xe0]\n"
+ "str q1, [x20, #0xf0]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "str q16, [x20, #0x0]\n"
+ "str q17, [x20, #0x10]\n"
+ "str q3, [x20, #0x20]\n"
+ "str q7, [x20, #0x30]\n"
+ "str q4, [x20, #0x40]\n"
+ "str q28, [x20, #0x50]\n"
+ "str q21, [x20, #0x60]\n"
+ "str q23, [x20, #0x70]\n"
+ "str q2, [x20, #0x80]\n"
+ "str q5, [x20, #0x90]\n"
+ "str q26, [x20, #0xa0]\n"
+ "str q20, [x20, #0xb0]\n"
+ "str q12, [x20, #0xc0]\n"
+ "str q6, [x20, #0xd0]\n"
+ "str q31, [x20, #0xe0]\n"
+ "str q11, [x20, #0xf0]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x23, #0x20\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "sub x23, x23, #0x20\n"
+ "cmp x23, #0x20\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q18, [x19], #0x10\n"
+ "zip1 v1.8h, v17.8h, v16.8h\n"
+ "zip2 v0.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v31.8h, v17.8h, v16.8h\n"
+ "zip2 v30.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x21], #0x10\n"
+ "ldr q19, [x19], #0x10\n"
+ "zip1 v29.8h, v21.8h, v18.8h\n"
+ "zip2 v28.8h, v21.8h, v18.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v27.8h, v17.8h, v16.8h\n"
+ "zip2 v26.8h, v17.8h, v16.8h\n"
+ "ldr q25, [x21], #0x10\n"
+ "ldr q18, [x19], #0x10\n"
+ "zip1 v24.8h, v20.8h, v19.8h\n"
+ "zip2 v23.8h, v20.8h, v19.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x21], #0x10\n"
+ "ldr q16, [x19], #0x10\n"
+ "zip1 v19.8h, v25.8h, v18.8h\n"
+ "zip2 v18.8h, v25.8h, v18.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q1, [x20, #0x0]\n"
+ "str q0, [x20, #0x10]\n"
+ "str q31, [x20, #0x20]\n"
+ "str q30, [x20, #0x30]\n"
+ "str q27, [x20, #0x40]\n"
+ "str q26, [x20, #0x50]\n"
+ "str q22, [x20, #0x60]\n"
+ "str q21, [x20, #0x70]\n"
+ "str q29, [x20, #0x80]\n"
+ "str q28, [x20, #0x90]\n"
+ "str q24, [x20, #0xa0]\n"
+ "str q23, [x20, #0xb0]\n"
+ "str q19, [x20, #0xc0]\n"
+ "str q18, [x20, #0xd0]\n"
+ "str q17, [x20, #0xe0]\n"
+ "str q16, [x20, #0xf0]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x23, #0x10\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 16 loop: loop
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "sub x23, x23, #0x10\n"
+ "cmp x23, #0x10\n"
+ "ldr q24, [x21], #0x10\n"
+ "ldr q23, [x19], #0x10\n"
+ "zip1 v19.8h, v17.8h, v16.8h\n"
+ "zip2 v18.8h, v17.8h, v16.8h\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v22.8h, v17.8h, v16.8h\n"
+ "zip2 v21.8h, v17.8h, v16.8h\n"
+ "ldr q20, [x21], #0x10\n"
+ "ldr q16, [x19], #0x10\n"
+ "str q19, [x20, #0x0]\n"
+ "zip1 v19.8h, v24.8h, v23.8h\n"
+ "str q18, [x20, #0x10]\n"
+ "zip2 v18.8h, v24.8h, v23.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q22, [x20, #0x20]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q21, [x20, #0x30]\n"
+ "str q19, [x20, #0x80]\n"
+ "str q18, [x20, #0x90]\n"
+ "str q17, [x20, #0xa0]\n"
+ "str q16, [x20, #0xb0]\n"
+ "add x20, x20, #0x40\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 16 loop: skip
+ "cmp x23, #0x4\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x24], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "sub x23, x23, #0x4\n"
+ "cmp x23, #0x4\n"
+ "ldr d18, [x21], #0x8\n"
+ "ldr d17, [x19], #0x8\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str q16, [x20, #0x0]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [x20, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 4 loop: skip
+ "cmp x23, #0x1\n"
+ "blt 11f\n"
+ "10:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x24], #0x2\n"
+ "ldr h16, [x22], #0x2\n"
+ "sub x23, x23, #0x1\n"
+ "cmp x23, #0x1\n"
+ "ldr h18, [x21], #0x2\n"
+ "ldr h17, [x19], #0x2\n"
+ "zip1 v16.8h, v19.8h, v16.8h\n"
+ "str s16, [x20, #0x0]\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str s16, [x20, #0x80]\n"
+ "add x20, x20, #0x4\n"
+ "bge 10b\n"
+ "11:" // Main row loop: width 1 loop: skip
+ "cmp %x[height], #0x4\n"
+ "add %x[out], %x[out], #0x100\n"
+ "bge 1b\n"
+ "cbz %x[height], 24f\n"
+ "12:" // Main loop skip
+
+ "13:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x19, %x[width]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "cmp x19, #0x40\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: Unroll column loop
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "sub x19, x19, #0x40\n"
+ "zip1 v0.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip2 v31.8h, v18.8h, v17.8h\n"
+ "zip1 v30.8h, v19.8h, v16.8h\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip2 v29.8h, v19.8h, v16.8h\n"
+ "zip1 v28.8h, v18.8h, v17.8h\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip2 v27.8h, v18.8h, v17.8h\n"
+ "zip1 v26.8h, v19.8h, v16.8h\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip2 v25.8h, v19.8h, v16.8h\n"
+ "cmp x19, #0x40\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v24.8h, v18.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v17.8h\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip1 v22.8h, v19.8h, v16.8h\n"
+ "zip2 v21.8h, v19.8h, v16.8h\n"
+ "ldr q20, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "str q0, [x20, #0x0]\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "str q31, [x20, #0x10]\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "str q30, [x20, #0x20]\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q29, [x20, #0x30]\n"
+ "str q28, [x20, #0x40]\n"
+ "str q27, [x20, #0x50]\n"
+ "str q26, [x20, #0x60]\n"
+ "str q25, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "str q24, [x20, #0x0]\n"
+ "str q23, [x20, #0x10]\n"
+ "str q22, [x20, #0x20]\n"
+ "str q21, [x20, #0x30]\n"
+ "str q19, [x20, #0x40]\n"
+ "str q18, [x20, #0x50]\n"
+ "str q17, [x20, #0x60]\n"
+ "str q16, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0x20\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: Column loop
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "cmp x19, #0x20\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v24.8h, v18.8h, v17.8h\n"
+ "zip2 v23.8h, v18.8h, v17.8h\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "zip1 v22.8h, v19.8h, v16.8h\n"
+ "zip2 v21.8h, v19.8h, v16.8h\n"
+ "ldr q20, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q24, [x20, #0x0]\n"
+ "str q23, [x20, #0x10]\n"
+ "str q22, [x20, #0x20]\n"
+ "str q21, [x20, #0x30]\n"
+ "str q19, [x20, #0x40]\n"
+ "str q18, [x20, #0x50]\n"
+ "str q17, [x20, #0x60]\n"
+ "str q16, [x20, #0x70]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 16 loop: loop
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x22], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q20, [x24], #0x10\n"
+ "ldr q16, [x22], #0x10\n"
+ "zip1 v19.8h, v18.8h, v17.8h\n"
+ "zip2 v18.8h, v18.8h, v17.8h\n"
+ "zip1 v17.8h, v20.8h, v16.8h\n"
+ "zip2 v16.8h, v20.8h, v16.8h\n"
+ "str q19, [x20, #0x0]\n"
+ "str q18, [x20, #0x10]\n"
+ "str q17, [x20, #0x20]\n"
+ "str q16, [x20, #0x30]\n"
+ "add x20, x20, #0x40\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 21f\n"
+ "20:" // Tail row loop: width 4 loop: loop
+ "ldr d17, [x24], #0x8\n"
+ "ldr d16, [x22], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "cmp x19, #0x4\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str q16, [x20, #0x0]\n"
+ "add x20, x20, #0x10\n"
+ "bge 20b\n"
+ "21:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 23f\n"
+ "22:" // Tail row loop: width 1 loop: loop
+ "ldr h17, [x24], #0x2\n"
+ "ldr h16, [x22], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "cmp x19, #0x1\n"
+ "zip1 v16.8h, v17.8h, v16.8h\n"
+ "str s16, [x20, #0x0]\n"
+ "add x20, x20, #0x4\n"
+ "bge 22b\n"
+ "23:" // Tail row loop: width 1 loop: skip
+ "cmp %x[height], #0x1\n"
+ "add %x[out], %x[out], #0x80\n"
+ "bge 13b\n"
+ "24:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<32, 2, true, VLType::None>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_32_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
new file mode 100644
index 0000000000..4f7019f564
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_48.hpp
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
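+// Straight transpose of 16-bit elements into 24-element (48-byte) row
+// fragments, with no data conversion. Four input rows are handled per pass
+// of the main loop and one per pass of the tail loop.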
+void a64_transpose_interleave_48(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 24 * height * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q27, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q26, [x22], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q25, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q27, [x23, #0x0]\n"
+ "str q23, [x23, #0x10]\n"
+ "str q19, [x23, #0x20]\n"
+ "str q26, [x23, #0x30]\n"
+ "str q22, [x23, #0x40]\n"
+ "str q18, [x23, #0x50]\n"
+ "str q25, [x23, #0x60]\n"
+ "str q21, [x23, #0x70]\n"
+ "str q17, [x23, #0x80]\n"
+ "str q24, [x23, #0x90]\n"
+ "str q20, [x23, #0xa0]\n"
+ "str q16, [x23, #0xb0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q23, [x24], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q22, [x22], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q23, [x23, #0x0]\n"
+ "str q19, [x23, #0x10]\n"
+ "str q22, [x23, #0x30]\n"
+ "str q18, [x23, #0x40]\n"
+ "str q21, [x23, #0x60]\n"
+ "str q17, [x23, #0x70]\n"
+ "str q20, [x23, #0x90]\n"
+ "str q16, [x23, #0xa0]\n"
+ "add x23, x23, #0x20\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x24], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d18, [x22], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d17, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x23, #0x0]\n"
+ "str d18, [x23, #0x30]\n"
+ "str d17, [x23, #0x60]\n"
+ "str d16, [x23, #0x90]\n"
+ "add x23, x23, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x24], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h18, [x22], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h17, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x23, #0x0]\n"
+ "str h18, [x23, #0x30]\n"
+ "str h17, [x23, #0x60]\n"
+ "str h16, [x23, #0x90]\n"
+ "add x23, x23, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0xc0\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q18, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "cmp x19, #0x18\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "str q18, [x23, #0x0]\n"
+ "str q17, [x23, #0x10]\n"
+ "str q16, [x23, #0x20]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q17, [x24], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "str q17, [x23, #0x0]\n"
+ "str q16, [x23, #0x10]\n"
+ "add x23, x23, #0x20\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "cmp x19, #0x4\n"
+ "str d16, [x23, #0x0]\n"
+ "add x23, x23, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x24], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "cmp x19, #0x1\n"
+ "str h16, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x30\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
+
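+// The three specialisations below all funnel into the same 16-bit kernel:
+// the width is rescaled so that 12 floats, 24 __fp16 values and 6 doubles
+// each describe the same 48-byte (24 x uint16_t) row fragment.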
+template<>
+void Transform<12, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_48(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<24, 1, true, VLType::None>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_48(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<6, 1, true, VLType::None>(
+ double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_48(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(double) / 2,
+ stride * sizeof(double),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
new file mode 100644
index 0000000000..cb20172364
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x16.hpp
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
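+// Interleaves groups of sixteen input rows byte-wise into 4-column output
+// blocks. A single row loop handles all heights, substituting the zeroed
+// pad_row for any of the sixteen rows past the input height.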
+void a64_transpose_interleave_4_1x16(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 16) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 16) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+
+ "1:" // Main row loop: Head
+ "mov x16, %x[in]\n"
+ "mov x15, %x[out]\n"
+ "add x14, x16, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "cmp %x[height], #0xf\n"
+ "csel x20, x20, %x[pad_row], GT\n"
+ "csel x21, x21, %x[pad_row], GE\n"
+ "cmp %x[height], #0xd\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0xb\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x9\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "csel x27, x27, %x[pad_row], GE\n"
+ "cmp %x[height], #0x7\n"
+ "csel x28, x28, %x[pad_row], GT\n"
+ "csel x9, x9, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "csel x10, x10, %x[pad_row], GT\n"
+ "csel x11, x11, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x12, x12, %x[pad_row], GT\n"
+ "csel x13, x13, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x10\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q20, [x16], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q19, [x14], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q18, [x13], #0x10\n"
+ "ldr q11, [x12], #0x10\n"
+ "ldr q10, [x11], #0x10\n"
+ "ldr q9, [x10], #0x10\n"
+ "ldr q8, [x9], #0x10\n"
+ "ldr q7, [x28], #0x10\n"
+ "ldr q16, [x27], #0x10\n"
+ "zip1 v6.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x26], #0x10\n"
+ "zip2 v5.16b, v20.16b, v16.16b\n"
+ "ldr q16, [x25], #0x10\n"
+ "ldr q4, [x24], #0x10\n"
+ "zip1 v3.16b, v19.16b, v17.16b\n"
+ "ldr q2, [x23], #0x10\n"
+ "zip2 v1.16b, v19.16b, v17.16b\n"
+ "ldr q0, [x22], #0x10\n"
+ "zip1 v31.16b, v18.16b, v16.16b\n"
+ "ldr q30, [x21], #0x10\n"
+ "zip1 v27.16b, v11.16b, v4.16b\n"
+ "ldr q29, [x20], #0x10\n"
+ "zip2 v28.16b, v18.16b, v16.16b\n"
+ "zip1 v26.16b, v10.16b, v2.16b\n"
+ "zip1 v22.16b, v6.16b, v26.16b\n"
+ "zip1 v25.16b, v8.16b, v30.16b\n"
+ "zip1 v21.16b, v31.16b, v25.16b\n"
+ "zip1 v18.16b, v22.16b, v21.16b\n"
+ "zip1 v24.16b, v9.16b, v0.16b\n"
+ "zip1 v20.16b, v3.16b, v24.16b\n"
+ "zip1 v23.16b, v7.16b, v29.16b\n"
+ "zip1 v19.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip2 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "zip2 v22.16b, v6.16b, v26.16b\n"
+ "zip2 v21.16b, v31.16b, v25.16b\n"
+ "zip1 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v20.16b, v3.16b, v24.16b\n"
+ "zip2 v19.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip2 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "zip2 v27.16b, v10.16b, v2.16b\n"
+ "zip2 v26.16b, v8.16b, v30.16b\n"
+ "zip1 v22.16b, v5.16b, v27.16b\n"
+ "zip1 v21.16b, v28.16b, v26.16b\n"
+ "zip1 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v25.16b, v9.16b, v0.16b\n"
+ "zip1 v20.16b, v1.16b, v25.16b\n"
+ "zip2 v24.16b, v11.16b, v4.16b\n"
+ "zip2 v23.16b, v7.16b, v29.16b\n"
+ "zip1 v19.16b, v24.16b, v23.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip2 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "zip2 v22.16b, v5.16b, v27.16b\n"
+ "zip2 v21.16b, v28.16b, v26.16b\n"
+ "zip1 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v20.16b, v1.16b, v25.16b\n"
+ "zip2 v19.16b, v24.16b, v23.16b\n"
+ "zip1 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip2 v18.16b, v22.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr s17, [x16], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s20, [x14], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s18, [x13], #0x4\n"
+ "ldr s19, [x12], #0x4\n"
+ "ldr s27, [x11], #0x4\n"
+ "ldr s22, [x10], #0x4\n"
+ "ldr s26, [x9], #0x4\n"
+ "ldr s25, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "zip1 v21.16b, v17.16b, v16.16b\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s16, [x25], #0x4\n"
+ "zip1 v24.16b, v18.16b, v16.16b\n"
+ "ldr s18, [x24], #0x4\n"
+ "zip1 v20.16b, v20.16b, v17.16b\n"
+ "ldr s17, [x23], #0x4\n"
+ "ldr s16, [x22], #0x4\n"
+ "zip1 v23.16b, v19.16b, v18.16b\n"
+ "ldr s18, [x21], #0x4\n"
+ "ldr s19, [x20], #0x4\n"
+ "zip1 v17.16b, v27.16b, v17.16b\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "zip1 v22.16b, v21.16b, v17.16b\n"
+ "zip1 v21.16b, v20.16b, v16.16b\n"
+ "zip1 v16.16b, v26.16b, v18.16b\n"
+ "zip1 v20.16b, v24.16b, v16.16b\n"
+ "zip1 v18.16b, v22.16b, v20.16b\n"
+ "zip1 v16.16b, v25.16b, v19.16b\n"
+ "zip1 v19.16b, v23.16b, v16.16b\n"
+ "zip1 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip2 v18.16b, v22.16b, v20.16b\n"
+ "zip2 v17.16b, v21.16b, v19.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr b17, [x16], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b21, [x14], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b18, [x13], #0x1\n"
+ "ldr b20, [x12], #0x1\n"
+ "ldr b27, [x11], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b25, [x9], #0x1\n"
+ "ldr b24, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "zip1 v23.16b, v17.16b, v16.16b\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b16, [x25], #0x1\n"
+ "zip1 v22.16b, v18.16b, v16.16b\n"
+ "ldr b19, [x24], #0x1\n"
+ "zip1 v18.16b, v21.16b, v17.16b\n"
+ "ldr b17, [x23], #0x1\n"
+ "ldr b16, [x22], #0x1\n"
+ "zip1 v21.16b, v20.16b, v19.16b\n"
+ "ldr b20, [x21], #0x1\n"
+ "ldr b19, [x20], #0x1\n"
+ "zip1 v17.16b, v27.16b, v17.16b\n"
+ "zip1 v16.16b, v26.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v17.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v25.16b, v20.16b\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v24.16b, v19.16b\n"
+ "zip1 v16.16b, v21.16b, v16.16b\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "add x15, x15, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x40\n"
+ "cmp %x[height], #0x1\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 16, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_1x16(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 16, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_1x16(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
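
For reference, the byte-blocked layout these kernels emit can be modeled in scalar C++. This is a simplified sketch, not part of the patch: it assumes the prologue computes out_stride as 4 * roundup(height, B) bytes (visible in the 1x4 variant below; B is the block height, 16 here), and it folds the unroll and tail paths into one loop.

    #include <cstddef>
    #include <cstdint>

    // Scalar model of a64_transpose_interleave_4_1xB (B = 16 above, B = 4 below).
    // Each output tile is B rows by 4 columns, written column-major; rows past
    // 'height' are zero-padded up to the next multiple of B.  in_stride is in
    // bytes, as in the assembly helpers.
    template <size_t B>
    void transpose_interleave_4_1xB_ref(uint8_t *out, const uint8_t *in,
                                        size_t width, size_t in_stride, size_t height)
    {
        const size_t padded_h   = (height + B - 1) / B * B;
        const size_t out_stride = 4 * padded_h;   // bytes per 4-column panel
        for (size_t x = 0; x < width; x += 4) {
            uint8_t *panel = out + (x / 4) * out_stride;
            for (size_t k = 0; k < padded_h; k++) {
                for (size_t c = 0; c < 4 && x + c < width; c++) {
                    const uint8_t v = (k < height) ? in[k * in_stride + x + c] : 0;
                    panel[(k / B) * (4 * B) + c * B + (k % B)] = v;
                }
            }
        }
    }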
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
new file mode 100644
index 0000000000..27cebe26cf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_4_1x4.hpp
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_4_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 4) * sizeof(uint8_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x10\n"
+ "blt 8f\n"
+ "1:" // Main row loop: Head
+ "mov x16, %x[in]\n"
+ "mov x15, %x[out]\n"
+ "add x14, x16, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add x11, x12, %x[in_stride]\n"
+ "add x10, x11, %x[in_stride]\n"
+ "add x9, x10, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x10\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ldr q20, [x16], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q19, [x14], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x13], #0x10\n"
+ "zip1 v18.16b, v20.16b, v16.16b\n"
+ "ldr q17, [x12], #0x10\n"
+ "zip2 v5.16b, v20.16b, v16.16b\n"
+ "ldr q4, [x11], #0x10\n"
+ "ldr q3, [x10], #0x10\n"
+ "zip1 v16.16b, v19.16b, v17.16b\n"
+ "ldr q2, [x9], #0x10\n"
+ "zip2 v1.16b, v19.16b, v17.16b\n"
+ "ldr q0, [x28], #0x10\n"
+ "zip1 v22.16b, v18.16b, v16.16b\n"
+ "ldr q31, [x27], #0x10\n"
+ "zip2 v21.16b, v18.16b, v16.16b\n"
+ "ldr q30, [x26], #0x10\n"
+ "zip1 v29.16b, v5.16b, v1.16b\n"
+ "ldr q28, [x25], #0x10\n"
+ "zip1 v17.16b, v4.16b, v2.16b\n"
+ "ldr q27, [x24], #0x10\n"
+ "zip1 v16.16b, v3.16b, v0.16b\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v19.16b, v17.16b, v16.16b\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip2 v20.16b, v17.16b, v16.16b\n"
+ "ldr q24, [x21], #0x10\n"
+ "zip1 v18.16b, v31.16b, v28.16b\n"
+ "ldr q23, [x20], #0x10\n"
+ "zip1 v17.16b, v30.16b, v27.16b\n"
+ "str q22, [x15, #0x0]\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q19, [x15, #0x10]\n"
+ "zip2 v19.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip1 v18.16b, v26.16b, v24.16b\n"
+ "zip1 v17.16b, v25.16b, v23.16b\n"
+ "zip1 v16.16b, v18.16b, v17.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "zip2 v17.16b, v18.16b, v17.16b\n"
+ "str q21, [x15, #0x0]\n"
+ "zip2 v22.16b, v4.16b, v2.16b\n"
+ "str q20, [x15, #0x10]\n"
+ "zip2 v21.16b, v3.16b, v0.16b\n"
+ "str q19, [x15, #0x20]\n"
+ "zip1 v16.16b, v22.16b, v21.16b\n"
+ "str q17, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "zip2 v20.16b, v31.16b, v28.16b\n"
+ "str q29, [x15, #0x0]\n"
+ "zip2 v17.16b, v30.16b, v27.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x15, #0x20]\n"
+ "zip2 v19.16b, v26.16b, v24.16b\n"
+ "zip2 v18.16b, v25.16b, v23.16b\n"
+ "zip1 v16.16b, v19.16b, v18.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "zip2 v16.16b, v5.16b, v1.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip2 v16.16b, v22.16b, v21.16b\n"
+ "zip2 v17.16b, v20.16b, v17.16b\n"
+ "str q16, [x15, #0x10]\n"
+ "zip2 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [x15, #0x20]\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cmp x19, #0x4\n"
+ "blt 5f\n"
+ "4:" // Main row loop: Column loop
+ "ldr s18, [x16], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s17, [x14], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s16, [x13], #0x4\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x12], #0x4\n"
+ "ldr s18, [x11], #0x4\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "ldr s20, [x10], #0x4\n"
+ "ldr s17, [x9], #0x4\n"
+ "zip1 v23.16b, v19.16b, v16.16b\n"
+ "ldr s16, [x28], #0x4\n"
+ "zip1 v19.16b, v18.16b, v17.16b\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x26], #0x4\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ldr s17, [x25], #0x4\n"
+ "zip1 v21.16b, v19.16b, v16.16b\n"
+ "ldr s16, [x24], #0x4\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "ldr s17, [x21], #0x4\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x20], #0x4\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "str q23, [x15, #0x0]\n"
+ "str q21, [x15, #0x10]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str q18, [x15, #0x20]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x15, #0x30]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp x19, #0x1\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 1 loop: loop
+ "ldr b18, [x16], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b17, [x14], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x13], #0x1\n"
+ "zip1 v19.16b, v18.16b, v16.16b\n"
+ "ldr b16, [x12], #0x1\n"
+ "ldr b18, [x11], #0x1\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "ldr b20, [x10], #0x1\n"
+ "ldr b17, [x9], #0x1\n"
+ "zip1 v23.16b, v19.16b, v16.16b\n"
+ "ldr b16, [x28], #0x1\n"
+ "zip1 v19.16b, v18.16b, v17.16b\n"
+ "ldr b18, [x27], #0x1\n"
+ "ldr b22, [x26], #0x1\n"
+ "zip1 v16.16b, v20.16b, v16.16b\n"
+ "ldr b17, [x25], #0x1\n"
+ "zip1 v21.16b, v19.16b, v16.16b\n"
+ "ldr b16, [x24], #0x1\n"
+ "zip1 v18.16b, v18.16b, v17.16b\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b19, [x22], #0x1\n"
+ "zip1 v16.16b, v22.16b, v16.16b\n"
+ "ldr b17, [x21], #0x1\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr b16, [x20], #0x1\n"
+ "zip1 v17.16b, v20.16b, v17.16b\n"
+ "str s23, [x15, #0x0]\n"
+ "str s21, [x15, #0x10]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "str s18, [x15, #0x20]\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x15, #0x30]\n"
+ "add x15, x15, #0x4\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x40\n"
+ "cmp %x[height], #0x10\n"
+ "bge 1b\n"
+ "cbz %x[height], 16f\n"
+ "8:" // Main loop skip
+
+ "9:" // Tail row loop: Head
+ "mov x16, %x[in]\n"
+ "mov x15, %x[out]\n"
+ "add x14, x16, %x[in_stride]\n"
+ "add x13, x14, %x[in_stride]\n"
+ "add x12, x13, %x[in_stride]\n"
+ "add %x[in], x12, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x12, x12, %x[pad_row], GT\n"
+ "csel x13, x13, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x14, x14, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x10\n"
+ "blt 11f\n"
+ "10:" // Tail row loop: Unroll column loop
+ "ldr q19, [x16], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q18, [x14], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q17, [x13], #0x10\n"
+ "zip1 v20.16b, v19.16b, v17.16b\n"
+ "ldr q16, [x12], #0x10\n"
+ "zip2 v19.16b, v19.16b, v17.16b\n"
+ "zip1 v17.16b, v18.16b, v16.16b\n"
+ "zip2 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "zip2 v16.16b, v20.16b, v17.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "zip1 v17.16b, v19.16b, v18.16b\n"
+ "add x15, x15, %x[out_stride]\n"
+ "zip2 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [x15, #0x0]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "str q16, [x15, #0x0]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 10b\n"
+ "11:" // Tail row loop: Unroll column loop skip
+ "cmp x19, #0x4\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr s17, [x16], #0x4\n"
+ "sub x19, x19, #0x4\n"
+ "ldr s18, [x14], #0x4\n"
+ "cmp x19, #0x4\n"
+ "ldr s16, [x13], #0x4\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr s16, [x12], #0x4\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [x15, #0x0]\n"
+ "add x15, x15, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x19, #0x1\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 1 loop: loop
+ "ldr b17, [x16], #0x1\n"
+ "sub x19, x19, #0x1\n"
+ "ldr b18, [x14], #0x1\n"
+ "cmp x19, #0x1\n"
+ "ldr b16, [x13], #0x1\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x12], #0x1\n"
+ "zip1 v16.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ "str s16, [x15, #0x0]\n"
+ "add x15, x15, #0x4\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x10\n"
+ "cmp %x[height], #0x1\n"
+ "bge 9b\n"
+ "16:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 4, true, VLType::None>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 4, true, VLType::None>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_4_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
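
The CSEL sequence in the tail-row head above is what makes non-multiple-of-4 heights safe: any row pointer past the real height is redirected to the zeroed pad_row allocated on entry. The same selection in scalar form (a sketch, relative to the current 4-row stripe):

    #include <cstddef>
    #include <cstdint>

    // Scalar model of the CSEL-based row padding: rows at index >= height
    // read from a zero-filled pad_row instead of past the input buffer.
    static const uint8_t *select_row(const uint8_t *in, const uint8_t *pad_row,
                                     size_t in_stride, size_t height, size_t k)
    {
        return (k < height) ? in + k * in_stride : pad_row;
    }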
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
new file mode 100644
index 0000000000..c341b315aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_64.hpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_64(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 32 * height * sizeof(uint16_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q31, [x24], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "ldr q30, [x22], #0x10\n"
+ "cmp x19, #0x20\n"
+ "ldr q29, [x21], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x24], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "ldr q25, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q31, [x23, #0x0]\n"
+ "str q27, [x23, #0x10]\n"
+ "str q23, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q30, [x23, #0x40]\n"
+ "str q26, [x23, #0x50]\n"
+ "str q22, [x23, #0x60]\n"
+ "str q18, [x23, #0x70]\n"
+ "str q29, [x23, #0x80]\n"
+ "str q25, [x23, #0x90]\n"
+ "str q21, [x23, #0xa0]\n"
+ "str q17, [x23, #0xb0]\n"
+ "str q28, [x23, #0xc0]\n"
+ "str q24, [x23, #0xd0]\n"
+ "str q20, [x23, #0xe0]\n"
+ "str q16, [x23, #0xf0]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q23, [x24], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q22, [x22], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q23, [x23, #0x0]\n"
+ "str q19, [x23, #0x10]\n"
+ "str q22, [x23, #0x40]\n"
+ "str q18, [x23, #0x50]\n"
+ "str q21, [x23, #0x80]\n"
+ "str q17, [x23, #0x90]\n"
+ "str q20, [x23, #0xc0]\n"
+ "str q16, [x23, #0xd0]\n"
+ "add x23, x23, #0x20\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr d19, [x24], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "ldr d18, [x22], #0x8\n"
+ "cmp x19, #0x4\n"
+ "ldr d17, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "str d19, [x23, #0x0]\n"
+ "str d18, [x23, #0x40]\n"
+ "str d17, [x23, #0x80]\n"
+ "str d16, [x23, #0xc0]\n"
+ "add x23, x23, #0x8\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr h19, [x24], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "ldr h18, [x22], #0x2\n"
+ "cmp x19, #0x1\n"
+ "ldr h17, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "str h19, [x23, #0x0]\n"
+ "str h18, [x23, #0x40]\n"
+ "str h17, [x23, #0x80]\n"
+ "str h16, [x23, #0xc0]\n"
+ "add x23, x23, #0x2\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x100\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x20\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q19, [x24], #0x10\n"
+ "sub x19, x19, #0x20\n"
+ "cmp x19, #0x20\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "str q19, [x23, #0x0]\n"
+ "str q18, [x23, #0x10]\n"
+ "str q17, [x23, #0x20]\n"
+ "str q16, [x23, #0x30]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q17, [x24], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "str q17, [x23, #0x0]\n"
+ "str q16, [x23, #0x10]\n"
+ "add x23, x23, #0x20\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr d16, [x24], #0x8\n"
+ "sub x19, x19, #0x4\n"
+ "cmp x19, #0x4\n"
+ "str d16, [x23, #0x0]\n"
+ "add x23, x23, #0x8\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr h16, [x24], #0x2\n"
+ "sub x19, x19, #0x1\n"
+ "cmp x19, #0x1\n"
+ "str h16, [x23, #0x0]\n"
+ "add x23, x23, #0x2\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x40\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<16, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_64(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<32, 1, true, VLType::None>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_64(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<32, 1, true, VLType::None>(
+ uint16_t *out, const uint16_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_64(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint16_t) / 2,
+ stride * sizeof(uint16_t),
+ (kmax-k0)
+ );
+}
+
+#endif
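
All three specialisations above funnel into the same uint16_t kernel; only the width argument changes, rescaled so that a row of N source elements is presented as N * sizeof(T) / 2 16-bit units (hence 16 floats and 32 halfs both map to 32 uint16_t). A sketch of that rule, with a hypothetical helper name:

    #include <cstddef>
    #include <cstdint>

    // Hypothetical helper mirroring the wrappers' width arithmetic: a row of
    // (xmax - x0) elements of T is handed to the uint16_t kernel as this many
    // 16-bit units.
    template <typename T>
    size_t kernel_width_u16(int x0, int xmax)
    {
        static_assert(sizeof(T) % sizeof(uint16_t) == 0, "T must be a multiple of 16 bits");
        return size_t(xmax - x0) * sizeof(T) / sizeof(uint16_t);
    }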
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
deleted file mode 100644
index df68740bb4..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 8x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 16 x uint16_t specialisation
- TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 16x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t *>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 16 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *const out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR q1, [%[in0], #0x10]\n"
- "STR q1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x20\n"
- ASM_PREFETCH("[%[in0], #192]")
- : [in0] "+r" (in0)
- : [out] "r" (out)
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *const out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR q1, [%[in0], #0x10]\n"
- "STR q1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x20\n"
- ASM_PREFETCH("[%[in0], #192]")
-
- "LDR q2, [%[in1]]\n"
- "STR q2, [%[out], #0x20]\n"
- "LDR q3, [%[in1], #0x10]\n"
- "STR q3, [%[out], #0x30]\n"
- "ADD %x[in1], %x[in1], #0x20\n"
- ASM_PREFETCH("[%[in1], #192]")
- : [in0] "+r" (in0),
- [in1] "+r" (in1)
- : [out] "r" (out)
- : "v0", "v1", "v2", "v3", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *const out) {
- __asm __volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR q1, [%[in0], #0x10]\n"
- "STR q1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x20\n"
- ASM_PREFETCH("[%[in0], #192]")
-
- "LDR q2, [%[in1]]\n"
- "STR q2, [%[out], #0x20]\n"
- "LDR q3, [%[in1], #0x10]\n"
- "STR q3, [%[out], #0x30]\n"
- "ADD %x[in1], %x[in1], #0x20\n"
- ASM_PREFETCH("[%[in1], #192]")
-
- "LDR q0, [%[in2]]\n"
- "STR q0, [%[out], #0x40]\n"
- "LDR q1, [%[in2], #0x10]\n"
- "STR q1, [%[out], #0x50]\n"
- "ADD %x[in2], %x[in2], #0x20\n"
- ASM_PREFETCH("[%[in2], #192]")
-
- "LDR q2, [%[in3]]\n"
- "STR q2, [%[out], #0x60]\n"
- "LDR q3, [%[in3], #0x10]\n"
- "STR q3, [%[out], #0x70]\n"
- "ADD %x[in3], %x[in3], #0x20\n"
- ASM_PREFETCH("[%[in3], #192]")
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [in2] "+r" (in2),
- [in3] "+r" (in3)
- : [out] "r" (out)
- : "v0", "v1", "v2", "v3", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
new file mode 100644
index 0000000000..190999ba53
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_96.hpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __aarch64__
+
+namespace {
+
+void a64_transpose_interleave_96(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 24 * height * sizeof(uint32_t);
+
+ __asm__ __volatile__(
+ "cmp %x[height], #0x4\n"
+ "blt 10f\n"
+ "1:" // Main row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add x22, x24, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add x20, x21, %x[in_stride]\n"
+ "add %x[in], x20, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Column loop
+ "ldr q7, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "ldr q6, [x22], #0x10\n"
+ "cmp x19, #0x18\n"
+ "ldr q5, [x21], #0x10\n"
+ "ldr q4, [x20], #0x10\n"
+ "ldr q3, [x24], #0x10\n"
+ "ldr q2, [x22], #0x10\n"
+ "ldr q1, [x21], #0x10\n"
+ "ldr q0, [x20], #0x10\n"
+ "ldr q31, [x24], #0x10\n"
+ "ldr q30, [x22], #0x10\n"
+ "ldr q29, [x21], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x24], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "ldr q25, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q7, [x23, #0x0]\n"
+ "str q3, [x23, #0x10]\n"
+ "str q31, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q23, [x23, #0x40]\n"
+ "str q19, [x23, #0x50]\n"
+ "str q6, [x23, #0x60]\n"
+ "str q2, [x23, #0x70]\n"
+ "str q30, [x23, #0x80]\n"
+ "str q26, [x23, #0x90]\n"
+ "str q22, [x23, #0xa0]\n"
+ "str q18, [x23, #0xb0]\n"
+ "str q5, [x23, #0xc0]\n"
+ "str q1, [x23, #0xd0]\n"
+ "str q29, [x23, #0xe0]\n"
+ "str q25, [x23, #0xf0]\n"
+ "str q21, [x23, #0x100]\n"
+ "str q17, [x23, #0x110]\n"
+ "str q4, [x23, #0x120]\n"
+ "str q0, [x23, #0x130]\n"
+ "str q28, [x23, #0x140]\n"
+ "str q24, [x23, #0x150]\n"
+ "str q20, [x23, #0x160]\n"
+ "str q16, [x23, #0x170]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 5f\n"
+ "4:" // Main row loop: width 16 loop: loop
+ "ldr q31, [x24], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "ldr q30, [x22], #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q29, [x21], #0x10\n"
+ "ldr q28, [x20], #0x10\n"
+ "ldr q27, [x24], #0x10\n"
+ "ldr q26, [x22], #0x10\n"
+ "ldr q25, [x21], #0x10\n"
+ "ldr q24, [x20], #0x10\n"
+ "ldr q23, [x24], #0x10\n"
+ "ldr q22, [x22], #0x10\n"
+ "ldr q21, [x21], #0x10\n"
+ "ldr q20, [x20], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x22], #0x10\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q31, [x23, #0x0]\n"
+ "str q27, [x23, #0x10]\n"
+ "str q23, [x23, #0x20]\n"
+ "str q19, [x23, #0x30]\n"
+ "str q30, [x23, #0x60]\n"
+ "str q26, [x23, #0x70]\n"
+ "str q22, [x23, #0x80]\n"
+ "str q18, [x23, #0x90]\n"
+ "str q29, [x23, #0xc0]\n"
+ "str q25, [x23, #0xd0]\n"
+ "str q21, [x23, #0xe0]\n"
+ "str q17, [x23, #0xf0]\n"
+ "str q28, [x23, #0x120]\n"
+ "str q24, [x23, #0x130]\n"
+ "str q20, [x23, #0x140]\n"
+ "str q16, [x23, #0x150]\n"
+ "add x23, x23, #0x40\n"
+ "bge 4b\n"
+ "5:" // Main row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 7f\n"
+ "6:" // Main row loop: width 4 loop: loop
+ "ldr q19, [x24], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "ldr q18, [x22], #0x10\n"
+ "cmp x19, #0x4\n"
+ "ldr q17, [x21], #0x10\n"
+ "ldr q16, [x20], #0x10\n"
+ "str q19, [x23, #0x0]\n"
+ "str q18, [x23, #0x60]\n"
+ "str q17, [x23, #0xc0]\n"
+ "str q16, [x23, #0x120]\n"
+ "add x23, x23, #0x10\n"
+ "bge 6b\n"
+ "7:" // Main row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 9f\n"
+ "8:" // Main row loop: width 1 loop: loop
+ "ldr s19, [x24], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "ldr s18, [x22], #0x4\n"
+ "cmp x19, #0x1\n"
+ "ldr s17, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "str s19, [x23, #0x0]\n"
+ "str s18, [x23, #0x60]\n"
+ "str s17, [x23, #0xc0]\n"
+ "str s16, [x23, #0x120]\n"
+ "add x23, x23, #0x4\n"
+ "bge 8b\n"
+ "9:" // Main row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x180\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 20f\n"
+ "10:" // Main loop skip
+
+ "11:" // Tail row loop: Head
+ "mov x24, %x[in]\n"
+ "mov x23, %x[out]\n"
+ "add %x[in], x24, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x19, %x[width]\n"
+ "cmp x19, #0x18\n"
+ "blt 13f\n"
+ "12:" // Tail row loop: Column loop
+ "ldr q21, [x24], #0x10\n"
+ "sub x19, x19, #0x18\n"
+ "cmp x19, #0x18\n"
+ "ldr q20, [x24], #0x10\n"
+ "ldr q19, [x24], #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "str q21, [x23, #0x0]\n"
+ "str q20, [x23, #0x10]\n"
+ "str q19, [x23, #0x20]\n"
+ "str q18, [x23, #0x30]\n"
+ "str q17, [x23, #0x40]\n"
+ "str q16, [x23, #0x50]\n"
+ "add x23, x23, %x[out_stride]\n"
+ "bge 12b\n"
+ "13:" // Tail row loop: Column loop skip
+ "cmp x19, #0x10\n"
+ "blt 15f\n"
+ "14:" // Tail row loop: width 16 loop: loop
+ "ldr q19, [x24], #0x10\n"
+ "sub x19, x19, #0x10\n"
+ "cmp x19, #0x10\n"
+ "ldr q18, [x24], #0x10\n"
+ "ldr q17, [x24], #0x10\n"
+ "ldr q16, [x24], #0x10\n"
+ "str q19, [x23, #0x0]\n"
+ "str q18, [x23, #0x10]\n"
+ "str q17, [x23, #0x20]\n"
+ "str q16, [x23, #0x30]\n"
+ "add x23, x23, #0x40\n"
+ "bge 14b\n"
+ "15:" // Tail row loop: width 16 loop: skip
+ "cmp x19, #0x4\n"
+ "blt 17f\n"
+ "16:" // Tail row loop: width 4 loop: loop
+ "ldr q16, [x24], #0x10\n"
+ "sub x19, x19, #0x4\n"
+ "cmp x19, #0x4\n"
+ "str q16, [x23, #0x0]\n"
+ "add x23, x23, #0x10\n"
+ "bge 16b\n"
+ "17:" // Tail row loop: width 4 loop: skip
+ "cmp x19, #0x1\n"
+ "blt 19f\n"
+ "18:" // Tail row loop: width 1 loop: loop
+ "ldr s16, [x24], #0x4\n"
+ "sub x19, x19, #0x1\n"
+ "cmp x19, #0x1\n"
+ "str s16, [x23, #0x0]\n"
+ "add x23, x23, #0x4\n"
+ "bge 18b\n"
+ "19:" // Tail row loop: width 1 loop: skip
+ "add %x[out], %x[out], #0x60\n"
+ "cmp %x[height], #0x1\n"
+ "bge 11b\n"
+ "20:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<24, 1, true, VLType::None>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ a64_transpose_interleave_96(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
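
Unlike the blocked 1x4/1x16 transforms, the _48/_64/_96/_128 kernels are plain strip copies: panel p holds a contiguous run of columns from every row, with rows laid out back to back. A scalar model of the 24-element (96-byte) case above, a sketch with in_stride in elements rather than bytes:

    #include <cstddef>
    #include <cstdint>

    // Scalar model of a64_transpose_interleave_96: panel p holds columns
    // [24p, 24p + 24) of every row; within a panel, rows are stored back to
    // back, 24 uint32_t each.  Note in_stride is in elements here, not bytes.
    void transpose_interleave_96_ref(uint32_t *out, const uint32_t *in,
                                     size_t width, size_t in_stride, size_t height)
    {
        const size_t out_stride = 24 * height;   // elements per panel
        for (size_t k = 0; k < height; k++) {
            for (size_t x = 0; x < width; x++) {
                out[(x / 24) * out_stride + k * 24 + (x % 24)] = in[k * in_stride + x];
            }
        }
    }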
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
new file mode 100644
index 0000000000..895177b6cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list-sve.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "sve_transpose_interleave_12VL_2x4_fp32bf16.hpp"
+#include "sve_transpose_interleave_1VL_1x4.hpp"
+#include "sve_transpose_interleave_1VL.hpp"
+#include "sve_transpose_interleave_3VL_1x4.hpp"
+#include "sve_transpose_interleave_3VL_2x2.hpp"
+#include "sve_transpose_interleave_3VL.hpp"
+#include "sve_transpose_interleave_4VL_1x4.hpp"
+#include "sve_transpose_interleave_4VL_2x2.hpp"
+#include "sve_transpose_interleave_4VL.hpp"
+#include "sve_transpose_interleave_6VL_1x8.hpp"
+#include "sve_transpose_interleave_6VL_2x4_fp32bf16.hpp"
+#include "sve_transpose_interleave_6VL_2x4.hpp"
+#include "sve_transpose_interleave_6VL_4x2.hpp"
+#include "sve_transpose_interleave_8VL_1x4.hpp"
+#include "sve_transpose_interleave_8VL_1x8.hpp"
+#include "sve_transpose_interleave_8VL_2x2.hpp"
+#include "sve_transpose_interleave_8VL_2x4.hpp"
+#include "sve_transpose_interleave_8VL_2x4_fp32bf16.hpp"
+#include "sve_transpose_interleave_8VL.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index e092c729ba..adbaa6cf2f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -22,7 +22,28 @@
* SOFTWARE.
*/
#include "a32_transpose_interleave_8way_32bit.hpp"
-#include "a64_transpose_interleave_12way_16bit.hpp"
-#include "a64_transpose_interleave_12way_half_to_float.hpp"
-#include "a64_transpose_interleave_24way_16bit.hpp"
-#include "a64_transpose_interleave_8way_32bit.hpp"
+#include "a64_transpose_interleave_12_1x4.hpp"
+#include "a64_transpose_interleave_12_1x8.hpp"
+#include "a64_transpose_interleave_12_2x2.hpp"
+#include "a64_transpose_interleave_12_2x4_fp32bf16.hpp"
+#include "a64_transpose_interleave_12_2x4.hpp"
+#include "a64_transpose_interleave_128.hpp"
+#include "a64_transpose_interleave_12_s8s16.hpp"
+#include "a64_transpose_interleave_12_u8u16.hpp"
+#include "a64_transpose_interleave_16_1x4.hpp"
+#include "a64_transpose_interleave_16_1x8.hpp"
+#include "a64_transpose_interleave_16_2x2.hpp"
+#include "a64_transpose_interleave_16_2x4.hpp"
+#include "a64_transpose_interleave_16_2x4_fp32bf16.hpp"
+#include "a64_transpose_interleave_16.hpp"
+#include "a64_transpose_interleave_24_bf16fp32.hpp"
+#include "a64_transpose_interleave_24_fp16fp32.hpp"
+#include "a64_transpose_interleave_24_2x4_fp32bf16.hpp"
+#include "a64_transpose_interleave_24.hpp"
+#include "a64_transpose_interleave_32_1x4.hpp"
+#include "a64_transpose_interleave_32_2x2.hpp"
+#include "a64_transpose_interleave_4_1x16.hpp"
+#include "a64_transpose_interleave_4_1x4.hpp"
+#include "a64_transpose_interleave_48.hpp"
+#include "a64_transpose_interleave_64.hpp"
+#include "a64_transpose_interleave_96.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..ef94cbad39
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_12VL_2x4_fp32bf16.hpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_12VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 12 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p6.b\n"
+ "1:" // Main row loop: Head
+ "mov x27, %x[in]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "mov x24, %x[width]\n"
+ "cnth x23, ALL, MUL #6\n"
+ "add x22, x25, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "csel x22, x22, %x[pad_row], GT\n"
+ "csel x25, x25, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x26, x26, %x[pad_row], GT\n"
+ "cmp x24, x23\n"
+ "mov x21, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z22.s }, p6/Z, [x27]\n"
+ "ld1w { z7.s }, p6/Z, [x27, #1, MUL VL]\n"
+ "mov x20, x21\n"
+ "add x21, x21, %x[out_stride]\n"
+ "ld1w { z19.s }, p6/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z18.s }, p6/Z, [x27, #3, MUL VL]\n"
+ "mov x19, x21\n"
+ "sub x24, x24, x23\n"
+ "ld1w { z5.s }, p6/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z25.s }, p6/Z, [x27, #5, MUL VL]\n"
+ "cmp x24, x23\n"
+ "add x21, x21, %x[out_stride]\n"
+ "ld1w { z20.s }, p6/Z, [x27, #6, MUL VL]\n"
+ "ld1w { z23.s }, p6/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #12\n"
+ "ld1w { z4.s }, p6/Z, [x25]\n"
+ "ld1w { z10.s }, p6/Z, [x25, #1, MUL VL]\n"
+ "zip1 z14.s, z22.s, z4.s\n"
+ "zip2 z22.s, z22.s, z4.s\n"
+ "ld1w { z28.s }, p6/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z27.s }, p6/Z, [x25, #3, MUL VL]\n"
+ "zip1 z24.s, z7.s, z10.s\n"
+ "zip2 z15.s, z7.s, z10.s\n"
+ "ld1w { z7.s }, p6/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z2.s }, p6/Z, [x25, #5, MUL VL]\n"
+ "zip1 z9.s, z19.s, z28.s\n"
+ "zip2 z0.s, z19.s, z28.s\n"
+ "ld1w { z19.s }, p6/Z, [x25, #6, MUL VL]\n"
+ "ld1w { z16.s }, p6/Z, [x25, #7, MUL VL]\n"
+ "addvl x25, x25, #12\n"
+ "zip1 z1.s, z18.s, z27.s\n"
+ "ld1w { z30.s }, p6/Z, [x27, #-4, MUL VL]\n"
+ "ld1w { z29.s }, p6/Z, [x27, #-3, MUL VL]\n"
+ "zip2 z17.s, z18.s, z27.s\n"
+ ".inst 0x658ab9d5 // bfcvt z21.h, p6/M, z14.s\n"
+ "ld1w { z31.s }, p6/Z, [x26]\n"
+ "ld1w { z8.s }, p6/Z, [x26, #1, MUL VL]\n"
+ ".inst 0x658abacc // bfcvt z12.h, p6/M, z22.s\n"
+ ".inst 0x658abb0e // bfcvt z14.h, p6/M, z24.s\n"
+ "ld1w { z22.s }, p6/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z28.s }, p6/Z, [x26, #3, MUL VL]\n"
+ ".inst 0x658ab9ea // bfcvt z10.h, p6/M, z15.s\n"
+ ".inst 0x658ab92f // bfcvt z15.h, p6/M, z9.s\n"
+ "ld1w { z27.s }, p6/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z13.s }, p6/Z, [x26, #5, MUL VL]\n"
+ ".inst 0x658ab803 // bfcvt z3.h, p6/M, z0.s\n"
+ ".inst 0x658ab832 // bfcvt z18.h, p6/M, z1.s\n"
+ "ld1w { z26.s }, p6/Z, [x26, #6, MUL VL]\n"
+ "ld1w { z9.s }, p6/Z, [x26, #7, MUL VL]\n"
+ "addvl x26, x26, #12\n"
+ ".inst 0x658aba26 // bfcvt z6.h, p6/M, z17.s\n"
+ "ld1w { z1.s }, p6/Z, [x25, #-4, MUL VL]\n"
+ "ld1w { z0.s }, p6/Z, [x25, #-3, MUL VL]\n"
+ "zip1 z17.s, z5.s, z7.s\n"
+ "zip2 z5.s, z5.s, z7.s\n"
+ "ld1w { z24.s }, p6/Z, [x22]\n"
+ "ld1w { z11.s }, p6/Z, [x22, #1, MUL VL]\n"
+ "zip1 z7.s, z31.s, z24.s\n"
+ "zip2 z31.s, z31.s, z24.s\n"
+ "ld1w { z4.s }, p6/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z24.s }, p6/Z, [x22, #3, MUL VL]\n"
+ ".inst 0x648ab8f5 // bfcvtnt z21.h, p6/M, z7.s\n"
+ "zip1 z7.s, z8.s, z11.s\n"
+ "zip2 z11.s, z8.s, z11.s\n"
+ "ld1w { z8.s }, p6/Z, [x22, #4, MUL VL]\n"
+ ".inst 0x648abbec // bfcvtnt z12.h, p6/M, z31.s\n"
+ "ld1w { z31.s }, p6/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x648ab8ee // bfcvtnt z14.h, p6/M, z7.s\n"
+ "ld1w { z7.s }, p6/Z, [x22, #6, MUL VL]\n"
+ ".inst 0x648ab96a // bfcvtnt z10.h, p6/M, z11.s\n"
+ "zip1 z11.s, z22.s, z4.s\n"
+ "zip2 z4.s, z22.s, z4.s\n"
+ "ld1w { z22.s }, p6/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #12\n"
+ ".inst 0x648ab96f // bfcvtnt z15.h, p6/M, z11.s\n"
+ "ld1w { z11.s }, p6/Z, [x27, #-2, MUL VL]\n"
+ ".inst 0x648ab883 // bfcvtnt z3.h, p6/M, z4.s\n"
+ "zip1 z4.s, z28.s, z24.s\n"
+ "zip2 z24.s, z28.s, z24.s\n"
+ "ld1w { z28.s }, p6/Z, [x27, #-1, MUL VL]\n"
+ ".inst 0x648ab892 // bfcvtnt z18.h, p6/M, z4.s\n"
+ "ld1w { z4.s }, p6/Z, [x26, #-4, MUL VL]\n"
+ ".inst 0x648abb06 // bfcvtnt z6.h, p6/M, z24.s\n"
+ "zip1 z24.s, z25.s, z2.s\n"
+ "zip2 z25.s, z25.s, z2.s\n"
+ "zip1 z2.s, z20.s, z19.s\n"
+ "zip2 z20.s, z20.s, z19.s\n"
+ "zip1 z19.s, z23.s, z16.s\n"
+ "zip2 z16.s, z23.s, z16.s\n"
+ "zip1 z23.s, z30.s, z1.s\n"
+ "zip2 z30.s, z30.s, z1.s\n"
+ "zip1 z1.s, z29.s, z0.s\n"
+ "zip2 z0.s, z29.s, z0.s\n"
+ ".inst 0x658aba31 // bfcvt z17.h, p6/M, z17.s\n"
+ "zip1 z29.s, z27.s, z8.s\n"
+ ".inst 0x658ab8a5 // bfcvt z5.h, p6/M, z5.s\n"
+ "zip2 z27.s, z27.s, z8.s\n"
+ "ld1w { z8.s }, p6/Z, [x26, #-3, MUL VL]\n"
+ ".inst 0x658abb18 // bfcvt z24.h, p6/M, z24.s\n"
+ ".inst 0x658abb39 // bfcvt z25.h, p6/M, z25.s\n"
+ ".inst 0x658ab842 // bfcvt z2.h, p6/M, z2.s\n"
+ ".inst 0x658aba94 // bfcvt z20.h, p6/M, z20.s\n"
+ ".inst 0x658aba73 // bfcvt z19.h, p6/M, z19.s\n"
+ ".inst 0x658aba10 // bfcvt z16.h, p6/M, z16.s\n"
+ ".inst 0x658abaf7 // bfcvt z23.h, p6/M, z23.s\n"
+ ".inst 0x658abbde // bfcvt z30.h, p6/M, z30.s\n"
+ ".inst 0x658ab821 // bfcvt z1.h, p6/M, z1.s\n"
+ ".inst 0x658ab800 // bfcvt z0.h, p6/M, z0.s\n"
+ ".inst 0x648abbb1 // bfcvtnt z17.h, p6/M, z29.s\n"
+ "ld1w { z29.s }, p6/Z, [x25, #-2, MUL VL]\n"
+ ".inst 0x648abb65 // bfcvtnt z5.h, p6/M, z27.s\n"
+ "zip1 z27.s, z13.s, z31.s\n"
+ "zip2 z31.s, z13.s, z31.s\n"
+ "ld1w { z13.s }, p6/Z, [x25, #-1, MUL VL]\n"
+ ".inst 0x648abb78 // bfcvtnt z24.h, p6/M, z27.s\n"
+ "ld1w { z27.s }, p6/Z, [x22, #-4, MUL VL]\n"
+ ".inst 0x648abbf9 // bfcvtnt z25.h, p6/M, z31.s\n"
+ "zip1 z31.s, z26.s, z7.s\n"
+ "zip2 z26.s, z26.s, z7.s\n"
+ "ld1w { z7.s }, p6/Z, [x22, #-3, MUL VL]\n"
+ ".inst 0x648abbe2 // bfcvtnt z2.h, p6/M, z31.s\n"
+ "ld1w { z31.s }, p6/Z, [x26, #-2, MUL VL]\n"
+ ".inst 0x648abb54 // bfcvtnt z20.h, p6/M, z26.s\n"
+ "zip1 z26.s, z9.s, z22.s\n"
+ "zip2 z9.s, z9.s, z22.s\n"
+ "ld1w { z22.s }, p6/Z, [x26, #-1, MUL VL]\n"
+ ".inst 0x648abb53 // bfcvtnt z19.h, p6/M, z26.s\n"
+ "ld1w { z26.s }, p6/Z, [x22, #-2, MUL VL]\n"
+ ".inst 0x648ab930 // bfcvtnt z16.h, p6/M, z9.s\n"
+ "ld1w { z9.s }, p6/Z, [x22, #-1, MUL VL]\n"
+ "st1h { z21.h }, p6, [x20]\n"
+ "zip1 z21.s, z4.s, z27.s\n"
+ "zip2 z27.s, z4.s, z27.s\n"
+ "zip1 z4.s, z8.s, z7.s\n"
+ "zip2 z8.s, z8.s, z7.s\n"
+ "st1h { z12.h }, p6, [x20, #1, MUL VL]\n"
+ "zip1 z7.s, z11.s, z29.s\n"
+ "zip2 z11.s, z11.s, z29.s\n"
+ "st1h { z14.h }, p6, [x20, #2, MUL VL]\n"
+ "zip1 z29.s, z28.s, z13.s\n"
+ "zip2 z12.s, z28.s, z13.s\n"
+ "st1h { z10.h }, p6, [x20, #3, MUL VL]\n"
+ "st1h { z15.h }, p6, [x20, #4, MUL VL]\n"
+ ".inst 0x648abab7 // bfcvtnt z23.h, p6/M, z21.s\n"
+ ".inst 0x648abb7e // bfcvtnt z30.h, p6/M, z27.s\n"
+ "st1h { z3.h }, p6, [x20, #5, MUL VL]\n"
+ ".inst 0x648ab881 // bfcvtnt z1.h, p6/M, z4.s\n"
+ ".inst 0x648ab900 // bfcvtnt z0.h, p6/M, z8.s\n"
+ "st1h { z18.h }, p6, [x20, #6, MUL VL]\n"
+ ".inst 0x658ab8e8 // bfcvt z8.h, p6/M, z7.s\n"
+ "zip1 z27.s, z31.s, z26.s\n"
+ "st1h { z6.h }, p6, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ ".inst 0x658ab96e // bfcvt z14.h, p6/M, z11.s\n"
+ "zip2 z28.s, z31.s, z26.s\n"
+ ".inst 0x658abbbd // bfcvt z29.h, p6/M, z29.s\n"
+ "zip1 z21.s, z22.s, z9.s\n"
+ "st1h { z17.h }, p6, [x20, #-4, MUL VL]\n"
+ ".inst 0x658ab992 // bfcvt z18.h, p6/M, z12.s\n"
+ "zip2 z17.s, z22.s, z9.s\n"
+ "st1h { z5.h }, p6, [x20, #-3, MUL VL]\n"
+ "st1h { z24.h }, p6, [x20, #-2, MUL VL]\n"
+ ".inst 0x648abb68 // bfcvtnt z8.h, p6/M, z27.s\n"
+ ".inst 0x648abb8e // bfcvtnt z14.h, p6/M, z28.s\n"
+ "st1h { z25.h }, p6, [x20, #-1, MUL VL]\n"
+ ".inst 0x648ababd // bfcvtnt z29.h, p6/M, z21.s\n"
+ ".inst 0x648aba32 // bfcvtnt z18.h, p6/M, z17.s\n"
+ "st1h { z2.h }, p6, [x19]\n"
+ "st1h { z20.h }, p6, [x19, #1, MUL VL]\n"
+ "st1h { z19.h }, p6, [x19, #2, MUL VL]\n"
+ "st1h { z16.h }, p6, [x19, #3, MUL VL]\n"
+ "st1h { z23.h }, p6, [x19, #4, MUL VL]\n"
+ "st1h { z30.h }, p6, [x19, #5, MUL VL]\n"
+ "st1h { z1.h }, p6, [x19, #6, MUL VL]\n"
+ "st1h { z0.h }, p6, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ "st1h { z8.h }, p6, [x19, #-4, MUL VL]\n"
+ "st1h { z14.h }, p6, [x19, #-3, MUL VL]\n"
+ "st1h { z29.h }, p6, [x19, #-2, MUL VL]\n"
+ "st1h { z18.h }, p6, [x19, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x24, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x19, x24\n"
+ "whilelt p5.s, XZR, x19\n"
+ "ld1w { z22.s }, p5/Z, [x27]\n"
+ "ld1w { z21.s }, p5/Z, [x25]\n"
+ "decw x19\n"
+ "whilelt p4.s, XZR, x19\n"
+ "ld1w { z20.s }, p4/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x25, #1, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p3.s, XZR, x19\n"
+ "ld1w { z18.s }, p3/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x25, #2, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p2.s, XZR, x19\n"
+ "ld1w { z30.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [x25, #3, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p1.s, XZR, x19\n"
+ "ld1w { z13.s }, p1/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z29.s }, p5/Z, [x26]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z12.s }, p0/Z, [x27, #5, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x26, #1, MUL VL]\n"
+ "ld1w { z11.s }, p3/Z, [x26, #2, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x26, #3, MUL VL]\n"
+ "zip1 z27.s, z22.s, z21.s\n"
+ "zip2 z26.s, z22.s, z21.s\n"
+ "ld1w { z9.s }, p1/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z8.s }, p0/Z, [x25, #5, MUL VL]\n"
+ "zip1 z25.s, z20.s, z19.s\n"
+ "zip2 z24.s, z20.s, z19.s\n"
+ "ld1w { z23.s }, p5/Z, [x22]\n"
+ "ld1w { z22.s }, p4/Z, [x22, #1, MUL VL]\n"
+ "zip1 z21.s, z18.s, z17.s\n"
+ "zip2 z20.s, z18.s, z17.s\n"
+ "ld1w { z19.s }, p3/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "zip1 z17.s, z30.s, z16.s\n"
+ "zip2 z16.s, z30.s, z16.s\n"
+ "ld1w { z7.s }, p1/Z, [x26, #4, MUL VL]\n"
+ "ld1w { z6.s }, p0/Z, [x26, #5, MUL VL]\n"
+ ".inst 0x658abb65 // bfcvt z5.h, p6/M, z27.s\n"
+ "zip1 z4.s, z29.s, z23.s\n"
+ "ld1w { z3.s }, p1/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z2.s }, p0/Z, [x22, #5, MUL VL]\n"
+ ".inst 0x658abb41 // bfcvt z1.h, p6/M, z26.s\n"
+ "zip2 z0.s, z29.s, z23.s\n"
+ ".inst 0x658abb3f // bfcvt z31.h, p6/M, z25.s\n"
+ "zip1 z30.s, z28.s, z22.s\n"
+ "mov x19, x21\n"
+ "decd x24, ALL, MUL #12\n"
+ ".inst 0x658abb1d // bfcvt z29.h, p6/M, z24.s\n"
+ "zip2 z28.s, z28.s, z22.s\n"
+ "cmp x24, #0x0\n"
+ "addvl x27, x27, #6\n"
+ ".inst 0x658ababb // bfcvt z27.h, p6/M, z21.s\n"
+ "zip1 z23.s, z11.s, z19.s\n"
+ "addvl x26, x26, #6\n"
+ "addvl x25, x25, #6\n"
+ ".inst 0x658aba9a // bfcvt z26.h, p6/M, z20.s\n"
+ "zip2 z22.s, z11.s, z19.s\n"
+ "addvl x22, x22, #6\n"
+ "add x21, x21, %x[out_stride]\n"
+ ".inst 0x658aba39 // bfcvt z25.h, p6/M, z17.s\n"
+ "zip1 z21.s, z10.s, z18.s\n"
+ ".inst 0x658aba18 // bfcvt z24.h, p6/M, z16.s\n"
+ "zip2 z20.s, z10.s, z18.s\n"
+ "zip1 z19.s, z13.s, z9.s\n"
+ "zip2 z18.s, z13.s, z9.s\n"
+ "zip1 z17.s, z12.s, z8.s\n"
+ "zip2 z16.s, z12.s, z8.s\n"
+ ".inst 0x648ab885 // bfcvtnt z5.h, p6/M, z4.s\n"
+ ".inst 0x648ab801 // bfcvtnt z1.h, p6/M, z0.s\n"
+ "st1h { z5.h }, p6, [x19]\n"
+ ".inst 0x648abbdf // bfcvtnt z31.h, p6/M, z30.s\n"
+ ".inst 0x648abb9d // bfcvtnt z29.h, p6/M, z28.s\n"
+ "st1h { z1.h }, p6, [x19, #1, MUL VL]\n"
+ ".inst 0x648abafb // bfcvtnt z27.h, p6/M, z23.s\n"
+ ".inst 0x648abada // bfcvtnt z26.h, p6/M, z22.s\n"
+ "st1h { z31.h }, p6, [x19, #2, MUL VL]\n"
+ ".inst 0x648abab9 // bfcvtnt z25.h, p6/M, z21.s\n"
+ ".inst 0x648aba98 // bfcvtnt z24.h, p6/M, z20.s\n"
+ "st1h { z29.h }, p6, [x19, #3, MUL VL]\n"
+ ".inst 0x658aba77 // bfcvt z23.h, p6/M, z19.s\n"
+ "zip1 z22.s, z7.s, z3.s\n"
+ "st1h { z27.h }, p6, [x19, #4, MUL VL]\n"
+ ".inst 0x658aba55 // bfcvt z21.h, p6/M, z18.s\n"
+ "zip2 z20.s, z7.s, z3.s\n"
+ "st1h { z26.h }, p6, [x19, #5, MUL VL]\n"
+ ".inst 0x658aba33 // bfcvt z19.h, p6/M, z17.s\n"
+ "zip1 z18.s, z6.s, z2.s\n"
+ "st1h { z25.h }, p6, [x19, #6, MUL VL]\n"
+ ".inst 0x658aba11 // bfcvt z17.h, p6/M, z16.s\n"
+ "zip2 z16.s, z6.s, z2.s\n"
+ "st1h { z24.h }, p6, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ ".inst 0x648abad7 // bfcvtnt z23.h, p6/M, z22.s\n"
+ ".inst 0x648aba95 // bfcvtnt z21.h, p6/M, z20.s\n"
+ "st1h { z23.h }, p6, [x19, #-4, MUL VL]\n"
+ ".inst 0x648aba53 // bfcvtnt z19.h, p6/M, z18.s\n"
+ ".inst 0x648aba11 // bfcvtnt z17.h, p6/M, z16.s\n"
+ "st1h { z21.h }, p6, [x19, #-3, MUL VL]\n"
+ "st1h { z19.h }, p6, [x19, #-2, MUL VL]\n"
+ "st1h { z17.h }, p6, [x19, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #12\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<12, 4, true, VLType::SVE>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_12VL_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
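
The raw .inst words above encode BFCVT/BFCVTNT, presumably spelled out so the file still assembles on toolchains without BF16 support: BFCVT narrows one fp32 vector into the even 16-bit lanes, and BFCVTNT fills the odd lanes from a second vector, which is what produces the 2x4-blocked bf16 output. The rounding both perform can be modeled in scalar C++; a sketch assuming round-to-nearest-even and ignoring NaN quieting:

    #include <cstdint>
    #include <cstring>

    // Scalar model of the BFCVT rounding: fp32 -> bf16, round to nearest even.
    // The real instruction also quiets NaNs, which this sketch does not attempt.
    static uint16_t fp32_to_bf16_rne(float f)
    {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        const uint32_t bias = 0x7FFFu + ((bits >> 16) & 1u);  // round half to even
        return uint16_t((bits + bias) >> 16);
    }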
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
new file mode 100644
index 0000000000..33694dfb0c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL.hpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_1VL(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 1 * height * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "cmp %x[height], #0x4\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[out]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
+ "cntw x19, ALL, MUL #2\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z23.s }, p1/Z, [x25]\n"
+ "sub x20, x20, x19\n"
+ "ld1w { z22.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "ld1w { z21.s }, p1/Z, [x23]\n"
+ "cmp x20, x19\n"
+ "ld1w { z20.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "ld1w { z19.s }, p1/Z, [x22]\n"
+ "ld1w { z18.s }, p1/Z, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "st1w { z23.s }, p1, [x24]\n"
+ "st1w { z21.s }, p1, [x24, #1, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z17.s }, p1, [x24, #3, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "st1w { z22.s }, p1, [x24]\n"
+ "st1w { z20.s }, p1, [x24, #1, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x24, #3, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x20, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z19.s }, p0/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "ld1w { z18.s }, p0/Z, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "decw x20\n"
+ "st1w { z18.s }, p1, [x24, #1, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "st1w { z17.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x24, #3, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #4\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[out]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x20, %x[width]\n"
+ "cntw x19, ALL, MUL #2\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1w { z17.s }, p1/Z, [x25]\n"
+ "sub x20, x20, x19\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "cmp x20, x19\n"
+ "st1w { z17.s }, p1, [x24]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "st1w { z16.s }, p1, [x24]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "decw x20\n"
+ "st1w { z16.s }, p1, [x24]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "cmp x20, #0x0\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #1\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<1, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_1VL(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
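For reference, the layout written by sve_transpose_interleave_1VL can be modelled in scalar code: the output is a sequence of panels, each holding one vector's worth of consecutive columns from every row, with out-of-range columns zero-filled just as the zeroing predicated loads do. A hedged sketch, ignoring the row/column unrolling the assembly performs ('vl', the number of 32-bit lanes per SVE vector, and the function name are assumptions of the sketch):

#include <cstddef>
#include <cstdint>

// Illustrative scalar model of the panel layout produced above.
void reference_transpose_interleave_1vl(uint32_t *out, const uint32_t *in,
                                        size_t width, size_t stride_elems,
                                        size_t height, size_t vl) {
    for (size_t x0 = 0; x0 < width; x0 += vl) {   // one panel per vl columns
        for (size_t y = 0; y < height; y++) {     // each row contributes vl lanes
            for (size_t x = x0; x < x0 + vl; x++) {
                *out++ = (x < width) ? in[y * stride_elems + x] : 0; // zero pad
            }
        }
    }
}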
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
new file mode 100644
index 0000000000..e4fb7ea4c1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_1VL_1x4.hpp
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_1VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 1 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #2\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z17.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z3.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "ld1b { z21.b }, p1/Z, [x27]\n"
+ "cmp x20, x19\n"
+ "ld1b { z2.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "ld1b { z16.b }, p1/Z, [x26]\n"
+ "zip1 z20.b, z17.b, z16.b\n"
+ "ld1b { z1.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "zip2 z19.b, z17.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x25]\n"
+ "ld1b { z0.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z31.b, z3.b, z1.b\n"
+ "ld1b { z30.b }, p1/Z, [x24]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z16.b, z21.b, z17.b\n"
+ "ld1b { z29.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "zip1 z18.b, z20.b, z16.b\n"
+ "ld1b { z28.b }, p1/Z, [x23]\n"
+ "zip2 z27.b, z20.b, z16.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x22]\n"
+ "zip1 z25.b, z2.b, z0.b\n"
+ "ld1b { z24.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "zip1 z23.b, z19.b, z17.b\n"
+ "ld1b { z22.b }, p1/Z, [x21]\n"
+ "zip2 z20.b, z19.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "zip1 z19.b, z30.b, z16.b\n"
+ "st1b { z18.b }, p1, [x28]\n"
+ "zip2 z18.b, z30.b, z16.b\n"
+ "zip1 z17.b, z28.b, z22.b\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z16.b, z19.b, z17.b\n"
+ "st1b { z27.b }, p1, [x28]\n"
+ "zip2 z17.b, z28.b, z22.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z23.b }, p1, [x28]\n"
+ "zip2 z17.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z16.b, z31.b, z25.b\n"
+ "st1b { z20.b }, p1, [x28]\n"
+ "zip1 z19.b, z29.b, z24.b\n"
+ "st1b { z17.b }, p1, [x28, #1, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z18.b, z26.b, z21.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z17.b, z31.b, z25.b\n"
+ "zip1 z16.b, z19.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z16.b, z19.b, z18.b\n"
+ "st1b { z17.b }, p1, [x28]\n"
+ "zip2 z20.b, z3.b, z1.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z19.b, z2.b, z0.b\n"
+ "zip2 z18.b, z29.b, z24.b\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z17.b, z26.b, z21.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x20, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "incd x9, ALL, MUL #2\n"
+ "ld1b { z17.b }, p0/Z, [x27]\n"
+ "incd x27, ALL, MUL #2\n"
+ "ld1b { z16.b }, p0/Z, [x26]\n"
+ "zip1 z18.b, z18.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x25]\n"
+ "incd x26, ALL, MUL #2\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "ld1b { z17.b }, p0/Z, [x24]\n"
+ "incd x25, ALL, MUL #2\n"
+ "zip1 z19.b, z18.b, z16.b\n"
+ "ld1b { z18.b }, p0/Z, [x23]\n"
+ "incd x24, ALL, MUL #2\n"
+ "ld1b { z16.b }, p0/Z, [x22]\n"
+ "zip1 z17.b, z17.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x21]\n"
+ "incd x23, ALL, MUL #2\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "st1b { z19.b }, p1, [x28]\n"
+ "incd x22, ALL, MUL #2\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "incd x21, ALL, MUL #2\n"
+ "add x28, x28, %x[out_stride]\n"
+ "decw x20\n"
+ "cmp x20, #0x0\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #2\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #2\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1b { z19.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z18.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "ld1b { z25.b }, p1/Z, [x27]\n"
+ "cmp x20, x19\n"
+ "ld1b { z24.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "ld1b { z17.b }, p1/Z, [x26]\n"
+ "zip1 z23.b, z19.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "zip2 z22.b, z19.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x25]\n"
+ "ld1b { z20.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z19.b, z18.b, z16.b\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z18.b, z18.b, z16.b\n"
+ "zip1 z17.b, z25.b, z21.b\n"
+ "zip1 z16.b, z23.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z16.b, z23.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z17.b, z25.b, z21.b\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z16.b, z22.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z22.b, z17.b\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z17.b, z24.b, z20.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z19.b, z17.b\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z17.b, z24.b, z20.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "add x28, x28, %x[out_stride]\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z17.b }, p0/Z, [x9]\n"
+ "incd x9, ALL, MUL #2\n"
+ "ld1b { z18.b }, p0/Z, [x27]\n"
+ "incd x27, ALL, MUL #2\n"
+ "ld1b { z16.b }, p0/Z, [x26]\n"
+ "zip1 z17.b, z17.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x25]\n"
+ "incd x26, ALL, MUL #2\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "incd x25, ALL, MUL #2\n"
+ "decw x20\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "cmp x20, #0x0\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #1\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<1, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_1VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<1, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_1VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
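Here the block depth of 4 packs four consecutive k-values (rows) of one column into each 32-bit output lane, the shape the integer dot-product kernels consume; pad_row supplies zeros when height is not a multiple of 4. A hedged scalar model of the resulting layout, ignoring the unrolling the assembly performs ('vl_w', the number of 32-bit lanes per vector, and the function name are assumptions):

#include <cstddef>
#include <cstdint>

// Illustrative scalar model; each panel covers vl_w columns.
void reference_interleave_1vl_1x4(uint8_t *out, const uint8_t *in,
                                  size_t width, size_t stride,
                                  size_t height, size_t vl_w) {
    for (size_t x0 = 0; x0 < width; x0 += vl_w) {
        for (size_t k = 0; k < height; k += 4) {      // k-blocks of 4 rows
            for (size_t x = x0; x < x0 + vl_w; x++) {
                for (size_t i = 0; i < 4; i++) {      // 4 k-values per 32-bit lane
                    size_t r = k + i;
                    *out++ = (r < height && x < width) ? in[r * stride + x] : 0;
                }
            }
        }
    }
}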
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
new file mode 100644
index 0000000000..0d694f3ec0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL.hpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_3VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 3 * height * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cmp %x[height], #0x4\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[out]\n"
+ "add x24, x26, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "mov x19, x25\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z27.h }, p0/Z, [x26]\n"
+ "ld1h { z26.h }, p0/Z, [x24]\n"
+ "dech x20\n"
+ "ld1h { z25.h }, p0/Z, [x23]\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z24.h }, p0/Z, [x22]\n"
+ "dech x20\n"
+ "ld1h { z23.h }, p1/Z, [x26, #1, MUL VL]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z22.h }, p1/Z, [x24, #1, MUL VL]\n"
+ "add x25, x25, %x[out_stride]\n"
+ "ld1h { z21.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "ld1h { z20.h }, p0/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "ld1h { z19.h }, p1/Z, [x23, #1, MUL VL]\n"
+ "dech x21, ALL, MUL #3\n"
+ "ld1h { z18.h }, p0/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "ld1h { z17.h }, p1/Z, [x22, #1, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z16.h }, p0/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "st1h { z27.h }, p2, [x19]\n"
+ "st1h { z23.h }, p2, [x19, #1, MUL VL]\n"
+ "st1h { z21.h }, p2, [x19, #2, MUL VL]\n"
+ "st1h { z26.h }, p2, [x19, #3, MUL VL]\n"
+ "st1h { z22.h }, p2, [x19, #4, MUL VL]\n"
+ "st1h { z20.h }, p2, [x19, #5, MUL VL]\n"
+ "st1h { z25.h }, p2, [x19, #6, MUL VL]\n"
+ "st1h { z19.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ "st1h { z18.h }, p2, [x19, #-4, MUL VL]\n"
+ "st1h { z24.h }, p2, [x19, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #12\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+
+ "5:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[out]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x20, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "dech x20, ALL, MUL #3\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z18.h }, p0/Z, [x26]\n"
+ "dech x19\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "dech x19\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z16.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "st1h { z18.h }, p2, [x25]\n"
+ "addvl x26, x26, #3\n"
+ "st1h { z17.h }, p2, [x25, #1, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "st1h { z16.h }, p2, [x25, #2, MUL VL]\n"
+ "add x25, x25, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #3\n"
+ "cmp %x[height], #0x1\n"
+ "bge 5b\n"
+ "8:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<3, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<3, 1, true, VLType::SVE>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<3, 1, true, VLType::SVE>(
+ double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(double) / 2,
+ stride * sizeof(double),
+ (kmax-k0)
+ );
+}
+
+#endif
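One 16-bit kernel serves float, __fp16 and double above because, with a block depth of 1, the transform only moves contiguous runs within a row, so any element whose size is a whole number of 16-bit units survives reinterpretation as uint16_t. A hedged helper showing the width scaling the specializations apply (the name is illustrative, not part of the library):

#include <cstddef>
#include <cstdint>

// Illustrative: width in 16-bit units for a row of T elements.
template <typename T>
size_t width_in_u16(int x0, int xmax) {
    static_assert(sizeof(T) % sizeof(uint16_t) == 0,
                  "element must be a whole number of 16-bit units");
    return size_t(xmax - x0) * (sizeof(T) / sizeof(uint16_t));
}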
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
new file mode 100644
index 0000000000..15b32c804f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_1x4.hpp
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_3VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 3 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #3\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z18.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z19.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1b { z10.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "ld1b { z24.b }, p1/Z, [x27]\n"
+ "ld1b { z23.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1b { z9.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "ld1b { z16.b }, p1/Z, [x26]\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "zip2 z18.b, z18.b, z16.b\n"
+ "ld1b { z8.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "zip1 z22.b, z19.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x25]\n"
+ "zip2 z7.b, z19.b, z17.b\n"
+ "ld1b { z20.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z6.b, z10.b, z8.b\n"
+ "ld1b { z5.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "zip1 z17.b, z24.b, z16.b\n"
+ "ld1b { z19.b }, p1/Z, [x24]\n"
+ "zip2 z16.b, z24.b, z16.b\n"
+ "ld1b { z4.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "zip1 z3.b, z21.b, z17.b\n"
+ "ld1b { z2.b }, p1/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "zip2 z1.b, z21.b, z17.b\n"
+ "ld1b { z0.b }, p1/Z, [x23]\n"
+ "zip1 z31.b, z18.b, z16.b\n"
+ "ld1b { z30.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip2 z29.b, z18.b, z16.b\n"
+ "ld1b { z28.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "zip1 z18.b, z23.b, z20.b\n"
+ "ld1b { z17.b }, p1/Z, [x22]\n"
+ "zip2 z27.b, z23.b, z20.b\n"
+ "ld1b { z26.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "zip1 z25.b, z22.b, z18.b\n"
+ "ld1b { z24.b }, p1/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "zip1 z21.b, z19.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x21]\n"
+ "zip2 z19.b, z19.b, z17.b\n"
+ "ld1b { z23.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "zip2 z20.b, z22.b, z18.b\n"
+ "ld1b { z22.b }, p1/Z, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #3\n"
+ "zip1 z17.b, z0.b, z16.b\n"
+ "st1b { z3.b }, p1, [x28]\n"
+ "zip2 z18.b, z0.b, z16.b\n"
+ "st1b { z1.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z21.b, z17.b\n"
+ "st1b { z31.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z16.b, z19.b, z18.b\n"
+ "st1b { z17.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z19.b, z19.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z18.b, z4.b, z26.b\n"
+ "st1b { z29.b }, p1, [x28]\n"
+ "zip1 z17.b, z30.b, z23.b\n"
+ "st1b { z25.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z20.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z18.b, z18.b, z17.b\n"
+ "st1b { z19.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z17.b, z7.b, z27.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z7.b, z27.b\n"
+ "st1b { z18.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z21.b, z9.b, z5.b\n"
+ "st1b { z17.b }, p1, [x28]\n"
+ "zip2 z18.b, z4.b, z26.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z6.b, z21.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z17.b, z30.b, z23.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip1 z20.b, z2.b, z24.b\n"
+ "zip1 z19.b, z28.b, z22.b\n"
+ "zip1 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z16.b, z6.b, z21.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z18.b, z10.b, z8.b\n"
+ "zip2 z17.b, z9.b, z5.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z20.b, z19.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip2 z18.b, z2.b, z24.b\n"
+ "zip2 z17.b, z28.b, z22.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x20, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "incd x9, ALL, MUL #6\n"
+ "ld1b { z25.b }, p0/Z, [x27]\n"
+ "incd x27, ALL, MUL #6\n"
+ "ld1b { z16.b }, p0/Z, [x26]\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "ld1b { z24.b }, p0/Z, [x25]\n"
+ "incd x26, ALL, MUL #6\n"
+ "zip2 z19.b, z18.b, z16.b\n"
+ "ld1b { z23.b }, p0/Z, [x24]\n"
+ "incd x25, ALL, MUL #6\n"
+ "zip1 z16.b, z25.b, z24.b\n"
+ "ld1b { z22.b }, p0/Z, [x23]\n"
+ "incd x24, ALL, MUL #6\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "ld1b { z21.b }, p0/Z, [x22]\n"
+ "incd x23, ALL, MUL #6\n"
+ "zip2 z17.b, z17.b, z16.b\n"
+ "ld1b { z20.b }, p0/Z, [x21]\n"
+ "incd x22, ALL, MUL #6\n"
+ "zip2 z16.b, z25.b, z24.b\n"
+ "st1b { z18.b }, p1, [x28]\n"
+ "incd x21, ALL, MUL #6\n"
+ "zip1 z16.b, z19.b, z16.b\n"
+ "st1b { z17.b }, p1, [x28, #1, MUL VL]\n"
+ "decw x20, ALL, MUL #3\n"
+ "zip1 z19.b, z23.b, z21.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "zip1 z18.b, z22.b, z20.b\n"
+ "zip2 z17.b, z23.b, z21.b\n"
+ "zip1 z16.b, z19.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip2 z16.b, z19.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z22.b, z20.b\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #6\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #3\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1b { z19.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z18.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1b { z30.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "ld1b { z29.b }, p1/Z, [x27]\n"
+ "ld1b { z28.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1b { z27.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "ld1b { z16.b }, p1/Z, [x26]\n"
+ "zip1 z26.b, z19.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "zip2 z25.b, z19.b, z16.b\n"
+ "ld1b { z24.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "zip1 z23.b, z18.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x25]\n"
+ "zip2 z22.b, z18.b, z17.b\n"
+ "ld1b { z21.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z20.b, z30.b, z24.b\n"
+ "ld1b { z19.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "zip1 z18.b, z29.b, z16.b\n"
+ "zip2 z17.b, z29.b, z16.b\n"
+ "zip1 z16.b, z26.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z26.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z25.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z16.b, z25.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip1 z18.b, z28.b, z21.b\n"
+ "zip2 z17.b, z28.b, z21.b\n"
+ "zip1 z16.b, z23.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z16.b, z23.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z16.b, z22.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z22.b, z17.b\n"
+ "zip1 z17.b, z27.b, z19.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z18.b, z30.b, z24.b\n"
+ "zip2 z17.b, z27.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "incd x9, ALL, MUL #6\n"
+ "ld1b { z21.b }, p0/Z, [x27]\n"
+ "incd x27, ALL, MUL #6\n"
+ "ld1b { z17.b }, p0/Z, [x26]\n"
+ "zip1 z20.b, z18.b, z17.b\n"
+ "ld1b { z16.b }, p0/Z, [x25]\n"
+ "incd x26, ALL, MUL #6\n"
+ "zip2 z19.b, z18.b, z17.b\n"
+ "incd x25, ALL, MUL #6\n"
+ "decw x20, ALL, MUL #3\n"
+ "zip1 z18.b, z21.b, z16.b\n"
+ "cmp x20, #0x0\n"
+ "zip2 z17.b, z21.b, z16.b\n"
+ "zip1 z16.b, z20.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z20.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #3\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<3, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<3, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
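The tail-row loop above substitutes the zero-filled pad_row for input rows past the real height, via csel, so the zip network can run unconditionally on four row pointers. A hedged C++ equivalent of that selection (the helper name is illustrative):

#include <cstddef>
#include <cstdint>

// Illustrative C++ equivalent of the tail-row csel sequence: row r of the
// current block is read from pad_row (all zeros) once r passes the height.
static inline const uint8_t *tail_row(const uint8_t *in, size_t in_stride,
                                      const uint8_t *pad_row,
                                      size_t height, size_t r) {
    return (r < height) ? in + r * in_stride : pad_row;
}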
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
new file mode 100644
index 0000000000..1864a16758
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_3VL_2x2.hpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_3VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 3 * roundup<size_t>(height, 2) * get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x22, %x[width]\n"
+ "cnth x21, ALL, MUL #3\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z19.h }, p2/Z, [x11]\n"
+ "mov x20, x10\n"
+ "ld1h { z18.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "ld1h { z21.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "addvl x11, x11, #3\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "zip1 z9.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "mov x19, x10\n"
+ "zip2 z8.h, z19.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "zip1 z7.h, z18.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip2 z6.h, z18.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "zip1 z5.h, z21.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "addvl x28, x28, #3\n"
+ "zip2 z4.h, z21.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x27]\n"
+ "cmp x22, x21\n"
+ "zip1 z3.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "zip2 z2.h, z19.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "zip1 z1.h, z20.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x26]\n"
+ "zip2 z0.h, z20.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "zip1 z31.h, z18.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "zip2 z30.h, z18.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x25]\n"
+ "ld1h { z17.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z29.h, z19.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "zip2 z28.h, z19.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x24]\n"
+ "zip1 z27.h, z21.h, z17.h\n"
+ "ld1h { z26.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip2 z25.h, z21.h, z17.h\n"
+ "ld1h { z24.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "zip1 z23.h, z20.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "zip2 z22.h, z20.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z16.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z21.h, z19.h, z18.h\n"
+ "st1h { z9.h }, p2, [x20]\n"
+ "addvl x23, x23, #3\n"
+ "zip2 z20.h, z19.h, z18.h\n"
+ "st1h { z8.h }, p2, [x20, #1, MUL VL]\n"
+ "zip1 z19.h, z26.h, z17.h\n"
+ "st1h { z7.h }, p2, [x20, #2, MUL VL]\n"
+ "zip2 z18.h, z26.h, z17.h\n"
+ "st1h { z3.h }, p2, [x20, #3, MUL VL]\n"
+ "zip1 z17.h, z24.h, z16.h\n"
+ "st1h { z2.h }, p2, [x20, #4, MUL VL]\n"
+ "zip2 z16.h, z24.h, z16.h\n"
+ "st1h { z1.h }, p2, [x20, #5, MUL VL]\n"
+ "st1h { z29.h }, p2, [x20, #6, MUL VL]\n"
+ "st1h { z28.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "st1h { z27.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z21.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z20.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z19.h }, p2, [x20, #-1, MUL VL]\n"
+ "st1h { z6.h }, p2, [x19]\n"
+ "st1h { z5.h }, p2, [x19, #1, MUL VL]\n"
+ "st1h { z4.h }, p2, [x19, #2, MUL VL]\n"
+ "st1h { z0.h }, p2, [x19, #3, MUL VL]\n"
+ "st1h { z31.h }, p2, [x19, #4, MUL VL]\n"
+ "st1h { z30.h }, p2, [x19, #5, MUL VL]\n"
+ "st1h { z25.h }, p2, [x19, #6, MUL VL]\n"
+ "st1h { z23.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ "st1h { z22.h }, p2, [x19, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x19, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x22\n"
+ "mov x19, x10\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z17.h }, p1/Z, [x11]\n"
+ "ld1h { z16.h }, p1/Z, [x9]\n"
+ "zip1 z29.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x28]\n"
+ "dech x20\n"
+ "zip2 z28.h, z17.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x27]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "zip1 z27.h, z18.h, z16.h\n"
+ "ld1h { z17.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "zip2 z26.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "incd x11, ALL, MUL #4\n"
+ "zip1 z25.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "addvl x9, x9, #1\n"
+ "ld1h { z16.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "zip1 z24.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p1/Z, [x26]\n"
+ "incd x9, ALL, MUL #4\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "addvl x28, x28, #1\n"
+ "ld1h { z17.h }, p1/Z, [x25]\n"
+ "zip1 z23.h, z19.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "incd x28, ALL, MUL #4\n"
+ "zip2 z22.h, z19.h, z17.h\n"
+ "ld1h { z21.h }, p1/Z, [x24]\n"
+ "addvl x27, x27, #1\n"
+ "zip1 z20.h, z18.h, z16.h\n"
+ "ld1h { z19.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "incd x27, ALL, MUL #4\n"
+ "ld1h { z17.h }, p1/Z, [x23]\n"
+ "zip1 z18.h, z21.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "addvl x26, x26, #1\n"
+ "zip2 z17.h, z21.h, z17.h\n"
+ "st1h { z29.h }, p2, [x19]\n"
+ "incd x26, ALL, MUL #4\n"
+ "zip1 z16.h, z19.h, z16.h\n"
+ "st1h { z28.h }, p2, [x19, #1, MUL VL]\n"
+ "addvl x25, x25, #1\n"
+ "st1h { z25.h }, p2, [x19, #2, MUL VL]\n"
+ "incd x25, ALL, MUL #4\n"
+ "st1h { z27.h }, p2, [x19, #3, MUL VL]\n"
+ "addvl x24, x24, #1\n"
+ "st1h { z26.h }, p2, [x19, #4, MUL VL]\n"
+ "incd x24, ALL, MUL #4\n"
+ "st1h { z24.h }, p2, [x19, #5, MUL VL]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z23.h }, p2, [x19, #6, MUL VL]\n"
+ "incd x23, ALL, MUL #4\n"
+ "st1h { z22.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ "add x10, x10, %x[out_stride]\n"
+ "st1h { z20.h }, p2, [x19, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x19, #-3, MUL VL]\n"
+ "decw x22, ALL, MUL #3\n"
+ "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n"
+ "cmp x22, #0x0\n"
+ "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #12\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add %x[in], x9, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x20, %x[width]\n"
+ "cnth x19, ALL, MUL #3\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "sub x20, x20, x19\n"
+ "ld1h { z22.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1h { z21.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "addvl x11, x11, #3\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "zip1 z16.h, z22.h, z18.h\n"
+ "st1h { z20.h }, p2, [x10]\n"
+ "zip2 z18.h, z22.h, z18.h\n"
+ "st1h { z17.h }, p2, [x10, #1, MUL VL]\n"
+ "zip1 z17.h, z21.h, z19.h\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip2 z16.h, z21.h, z19.h\n"
+ "st1h { z18.h }, p2, [x10]\n"
+ "st1h { z17.h }, p2, [x10, #1, MUL VL]\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "decw x20, ALL, MUL #3\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
+ "ld1h { z16.h }, p0/Z, [x9]\n"
+ "zip1 z19.h, z17.h, z16.h\n"
+ "dech x19\n"
+ "zip2 z18.h, z17.h, z16.h\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z16.h, z17.h, z16.h\n"
+ "st1h { z19.h }, p2, [x10]\n"
+ "incd x11, ALL, MUL #4\n"
+ "st1h { z18.h }, p2, [x10, #1, MUL VL]\n"
+ "addvl x9, x9, #1\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "incd x9, ALL, MUL #4\n"
+ "add x10, x10, %x[out_stride]\n"
+ "cmp x20, #0x0\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #3\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<3, 2, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_3VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
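The 2x2 variant pairs two k-values per 32-bit lane at 16-bit granularity, the pairing that bfloat16 dot/matmul instructions typically consume, and it builds the pairs with zip1/zip2 on halfword lanes. A hedged scalar model of that zip pair ('vl_h', the number of 16-bit lanes per vector, is an assumption of the sketch):

#include <cstddef>
#include <cstdint>

// Illustrative scalar model of SVE zip1/zip2 on 16-bit lanes: two rows a
// and b become lane pairs a0,b0,a1,b1,...
void zip_pair_u16(uint16_t *lo, uint16_t *hi,
                  const uint16_t *a, const uint16_t *b, size_t vl_h) {
    for (size_t i = 0; i < vl_h / 2; i++) {  // zip1: interleave low halves
        lo[2 * i]     = a[i];
        lo[2 * i + 1] = b[i];
    }
    for (size_t i = 0; i < vl_h / 2; i++) {  // zip2: interleave high halves
        hi[2 * i]     = a[vl_h / 2 + i];
        hi[2 * i + 1] = b[vl_h / 2 + i];
    }
}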
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
new file mode 100644
index 0000000000..aa9d7220fe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL.hpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_4VL(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 4 * height * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "cmp %x[height], #0x4\n"
+ "blt 4f\n"
+ "1:" // Main row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[out]\n"
+ "add x24, x26, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add %x[in], x22, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x21, %x[width]\n"
+ "2:" // Main row loop: Column loop
+ "mov x20, x21\n"
+ "mov x19, x25\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z31.h }, p0/Z, [x26]\n"
+ "ld1h { z30.h }, p0/Z, [x24]\n"
+ "dech x20\n"
+ "ld1h { z29.h }, p0/Z, [x23]\n"
+ "whilelt p2.h, XZR, x20\n"
+ "ld1h { z28.h }, p0/Z, [x22]\n"
+ "dech x20\n"
+ "ld1h { z27.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z26.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "dech x20\n"
+ "ld1h { z25.h }, p1/Z, [x26, #2, MUL VL]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z24.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "add x25, x25, %x[out_stride]\n"
+ "ld1h { z23.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ "ld1h { z22.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "ld1h { z21.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "dech x21, ALL, MUL #4\n"
+ "ld1h { z20.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "cmp x21, #0x0\n"
+ "ld1h { z19.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "ld1h { z18.h }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1h { z17.h }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1h { z16.h }, p0/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "st1h { z31.h }, p3, [x19]\n"
+ "st1h { z27.h }, p3, [x19, #1, MUL VL]\n"
+ "st1h { z25.h }, p3, [x19, #2, MUL VL]\n"
+ "st1h { z23.h }, p3, [x19, #3, MUL VL]\n"
+ "st1h { z30.h }, p3, [x19, #4, MUL VL]\n"
+ "st1h { z26.h }, p3, [x19, #5, MUL VL]\n"
+ "st1h { z24.h }, p3, [x19, #6, MUL VL]\n"
+ "st1h { z22.h }, p3, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "st1h { z29.h }, p3, [x19, #-8, MUL VL]\n"
+ "st1h { z21.h }, p3, [x19, #-7, MUL VL]\n"
+ "st1h { z20.h }, p3, [x19, #-6, MUL VL]\n"
+ "st1h { z19.h }, p3, [x19, #-5, MUL VL]\n"
+ "st1h { z28.h }, p3, [x19, #-4, MUL VL]\n"
+ "st1h { z18.h }, p3, [x19, #-3, MUL VL]\n"
+ "st1h { z17.h }, p3, [x19, #-2, MUL VL]\n"
+ "st1h { z16.h }, p3, [x19, #-1, MUL VL]\n"
+ "bgt 2b\n"
+ "3:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #16\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 8f\n"
+ "4:" // Main loop skip
+
+ "5:" // Tail row loop: Head
+ "mov x26, %x[in]\n"
+ "mov x25, %x[out]\n"
+ "add %x[in], x26, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x20, %x[width]\n"
+ "6:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "dech x20, ALL, MUL #4\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z19.h }, p0/Z, [x26]\n"
+ "dech x19\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z18.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "dech x19\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x26, #2, MUL VL]\n"
+ "dech x19\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z16.h }, p0/Z, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x25]\n"
+ "addvl x26, x26, #4\n"
+ "st1h { z18.h }, p3, [x25, #1, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "st1h { z17.h }, p3, [x25, #2, MUL VL]\n"
+ "st1h { z16.h }, p3, [x25, #3, MUL VL]\n"
+ "add x25, x25, %x[out_stride]\n"
+ "bgt 6b\n"
+ "7:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #4\n"
+ "cmp %x[height], #0x1\n"
+ "bge 5b\n"
+ "8:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 2,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ __fp16 *out, const __fp16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(__fp16) / 2,
+ stride * sizeof(__fp16),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 1, true, VLType::SVE>(
+ double *out, const double *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(double) / 2,
+ stride * sizeof(double),
+ (kmax-k0)
+ );
+}
+
+#endif
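Throughout these kernels the column tails rely on the same predication idiom: whilelt builds a partial predicate, the zeroing (/Z) loads fill inactive lanes with zero, and the full-predicate stores then write those zeros into the panel, so panels are always fully initialised. A hedged scalar model of one such load/store pair ('vl_h', the number of 16-bit lanes per vector, is an assumption):

#include <cstddef>
#include <cstdint>

// Illustrative scalar model of one whilelt / ld1h .../Z / st1h triple.
void predicated_copy_u16(uint16_t *dst, const uint16_t *src,
                         size_t remaining, size_t vl_h) {
    for (size_t lane = 0; lane < vl_h; lane++) {
        dst[lane] = (lane < remaining) ? src[lane] : 0; // inactive lanes store zero
    }
}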
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
new file mode 100644
index 0000000000..5e5f7a53a7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_1x4.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
+void sve_transpose_interleave_4VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #2\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z17.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z3.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "ld1b { z20.b }, p1/Z, [x27]\n"
+ "cmp x20, x19\n"
+ "ld1b { z2.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "ld1b { z16.b }, p1/Z, [x26]\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "ld1b { z1.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "zip2 z19.b, z17.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x25]\n"
+ "ld1b { z0.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z31.b, z3.b, z1.b\n"
+ "ld1b { z30.b }, p1/Z, [x24]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z16.b, z20.b, z17.b\n"
+ "ld1b { z29.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "zip1 z28.b, z18.b, z16.b\n"
+ "ld1b { z27.b }, p1/Z, [x23]\n"
+ "zip2 z26.b, z18.b, z16.b\n"
+ "ld1b { z25.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "zip2 z18.b, z20.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x22]\n"
+ "zip1 z24.b, z2.b, z0.b\n"
+ "ld1b { z23.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "zip1 z17.b, z19.b, z18.b\n"
+ "ld1b { z22.b }, p1/Z, [x21]\n"
+ "zip2 z21.b, z19.b, z18.b\n"
+ "ld1b { z20.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "zip1 z19.b, z30.b, z16.b\n"
+ "st1b { z28.b }, p1, [x28]\n"
+ "zip2 z18.b, z30.b, z16.b\n"
+ "st1b { z26.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z27.b, z22.b\n"
+ "st1b { z17.b }, p1, [x28, #2, MUL VL]\n"
+ "zip1 z17.b, z19.b, z16.b\n"
+ "st1b { z21.b }, p1, [x28, #3, MUL VL]\n"
+ "zip2 z16.b, z19.b, z16.b\n"
+ "st1b { z17.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z17.b, z27.b, z22.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z16.b, z31.b, z24.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z31.b, z24.b\n"
+ "zip2 z18.b, z3.b, z1.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z17.b, z2.b, z0.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z18.b, z29.b, z23.b\n"
+ "zip1 z17.b, z25.b, z20.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "zip2 z18.b, z29.b, z23.b\n"
+ "zip2 z17.b, z25.b, z20.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x20, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z17.b }, p0/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "ld1b { z25.b }, p0/Z, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "ld1b { z16.b }, p0/Z, [x26]\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "ld1b { z24.b }, p0/Z, [x25]\n"
+ "addvl x26, x26, #1\n"
+ "zip2 z23.b, z17.b, z16.b\n"
+ "ld1b { z22.b }, p0/Z, [x24]\n"
+ "addvl x25, x25, #1\n"
+ "zip1 z16.b, z25.b, z24.b\n"
+ "ld1b { z21.b }, p0/Z, [x23]\n"
+ "addvl x24, x24, #1\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "ld1b { z20.b }, p0/Z, [x22]\n"
+ "addvl x23, x23, #1\n"
+ "zip2 z18.b, z18.b, z16.b\n"
+ "ld1b { z19.b }, p0/Z, [x21]\n"
+ "addvl x22, x22, #1\n"
+ "zip2 z16.b, z25.b, z24.b\n"
+ "st1b { z17.b }, p1, [x28]\n"
+ "addvl x21, x21, #1\n"
+ "zip1 z17.b, z23.b, z16.b\n"
+ "st1b { z18.b }, p1, [x28, #1, MUL VL]\n"
+ "decw x20, ALL, MUL #4\n"
+ "zip2 z16.b, z23.b, z16.b\n"
+ "st1b { z17.b }, p1, [x28, #2, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "zip1 z18.b, z22.b, z20.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z17.b, z21.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "zip2 z18.b, z22.b, z20.b\n"
+ "zip2 z17.b, z21.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #8\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #2\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1b { z18.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z19.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "ld1b { z25.b }, p1/Z, [x27]\n"
+ "cmp x20, x19\n"
+ "ld1b { z24.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "ld1b { z17.b }, p1/Z, [x26]\n"
+ "zip1 z23.b, z18.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "zip2 z22.b, z18.b, z17.b\n"
+ "ld1b { z18.b }, p1/Z, [x25]\n"
+ "ld1b { z21.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "zip1 z20.b, z19.b, z16.b\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z19.b, z19.b, z16.b\n"
+ "zip1 z17.b, z25.b, z18.b\n"
+ "zip1 z16.b, z23.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z23.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z17.b, z25.b, z18.b\n"
+ "zip1 z16.b, z22.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z22.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z18.b, z24.b, z21.b\n"
+ "zip2 z17.b, z24.b, z21.b\n"
+ "zip1 z16.b, z20.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z20.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "ld1b { z21.b }, p0/Z, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "ld1b { z17.b }, p0/Z, [x26]\n"
+ "zip1 z20.b, z18.b, z17.b\n"
+ "ld1b { z16.b }, p0/Z, [x25]\n"
+ "addvl x26, x26, #1\n"
+ "zip2 z19.b, z18.b, z17.b\n"
+ "addvl x25, x25, #1\n"
+ "decw x20, ALL, MUL #4\n"
+ "zip1 z18.b, z21.b, z16.b\n"
+ "cmp x20, #0x0\n"
+ "zip2 z17.b, z21.b, z16.b\n"
+ "zip1 z16.b, z20.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z20.b, z18.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z19.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #4\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
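+// Both 8-bit specialisations below forward to the same unsigned kernel: the
+// transform only rearranges bytes, so signedness does not matter.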
+template<>
+void Transform<4, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<4, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
new file mode 100644
index 0000000000..48040f9edb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_4VL_2x2.hpp
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
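+// Transpose-interleave producing 4 vector lengths of 16-bit data per output
+// row, zipping input rows in 2x2 blocks. Odd heights are padded to a multiple
+// of 2 with the zeroed, stack-allocated pad_row.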
+void sve_transpose_interleave_4VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 4 * roundup<size_t>(height, 2) * get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x22, %x[width]\n"
+ "cnth x21, ALL, MUL #4\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z21.h }, p2/Z, [x11]\n"
+ "mov x20, x10\n"
+ "ld1h { z19.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "ld1h { z23.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "mov x19, x10\n"
+ "ld1h { z31.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "ld1h { z29.h }, p2/Z, [x9]\n"
+ "zip1 z0.h, z21.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip2 z22.h, z21.h, z29.h\n"
+ "ld1h { z15.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "zip1 z13.h, z19.h, z17.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "zip2 z12.h, z19.h, z17.h\n"
+ "ld1h { z20.h }, p2/Z, [x28]\n"
+ "cmp x22, x21\n"
+ "zip1 z14.h, z23.h, z15.h\n"
+ "ld1h { z1.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip2 z3.h, z23.h, z15.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "zip1 z16.h, z31.h, z6.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, #3, MUL VL]\n"
+ "addvl x28, x28, #4\n"
+ "zip2 z10.h, z31.h, z6.h\n"
+ "ld1h { z27.h }, p2/Z, [x27]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "zip1 z9.h, z20.h, z27.h\n"
+ "ld1h { z2.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "zip2 z24.h, z20.h, z27.h\n"
+ "ld1h { z5.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "zip1 z8.h, z1.h, z18.h\n"
+ "ld1h { z30.h }, p2/Z, [x26]\n"
+ "zip2 z17.h, z1.h, z18.h\n"
+ "ld1h { z28.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "zip1 z6.h, z19.h, z2.h\n"
+ "ld1h { z23.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "zip2 z1.h, z19.h, z2.h\n"
+ "ld1h { z25.h }, p2/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ "zip1 z31.h, z11.h, z5.h\n"
+ "ld1h { z21.h }, p2/Z, [x25]\n"
+ "zip2 z11.h, z11.h, z5.h\n"
+ "ld1h { z19.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "zip1 z29.h, z30.h, z21.h\n"
+ "ld1h { z26.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "zip2 z30.h, z30.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x24]\n"
+ "zip1 z27.h, z28.h, z19.h\n"
+ "ld1h { z20.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip2 z28.h, z28.h, z19.h\n"
+ "ld1h { z4.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip1 z2.h, z23.h, z18.h\n"
+ "ld1h { z15.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "zip2 z5.h, z23.h, z18.h\n"
+ "ld1h { z23.h }, p2/Z, [x23]\n"
+ "zip1 z7.h, z25.h, z26.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip2 z25.h, z25.h, z26.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1h { z0.h }, p2, [x20]\n"
+ "zip1 z0.h, z21.h, z23.h\n"
+ "zip2 z23.h, z21.h, z23.h\n"
+ "st1h { z22.h }, p2, [x20, #1, MUL VL]\n"
+ "zip1 z22.h, z20.h, z19.h\n"
+ "st1h { z13.h }, p2, [x20, #2, MUL VL]\n"
+ "zip2 z21.h, z20.h, z19.h\n"
+ "st1h { z12.h }, p2, [x20, #3, MUL VL]\n"
+ "zip1 z20.h, z4.h, z18.h\n"
+ "st1h { z9.h }, p2, [x20, #4, MUL VL]\n"
+ "zip2 z19.h, z4.h, z18.h\n"
+ "st1h { z24.h }, p2, [x20, #5, MUL VL]\n"
+ "zip1 z18.h, z15.h, z26.h\n"
+ "st1h { z8.h }, p2, [x20, #6, MUL VL]\n"
+ "zip2 z9.h, z15.h, z26.h\n"
+ "st1h { z17.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1h { z29.h }, p2, [x20, #-8, MUL VL]\n"
+ "st1h { z30.h }, p2, [x20, #-7, MUL VL]\n"
+ "st1h { z27.h }, p2, [x20, #-6, MUL VL]\n"
+ "st1h { z28.h }, p2, [x20, #-5, MUL VL]\n"
+ "st1h { z0.h }, p2, [x20, #-4, MUL VL]\n"
+ "st1h { z23.h }, p2, [x20, #-3, MUL VL]\n"
+ "st1h { z22.h }, p2, [x20, #-2, MUL VL]\n"
+ "st1h { z21.h }, p2, [x20, #-1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x19]\n"
+ "st1h { z3.h }, p2, [x19, #1, MUL VL]\n"
+ "st1h { z16.h }, p2, [x19, #2, MUL VL]\n"
+ "st1h { z10.h }, p2, [x19, #3, MUL VL]\n"
+ "st1h { z6.h }, p2, [x19, #4, MUL VL]\n"
+ "st1h { z1.h }, p2, [x19, #5, MUL VL]\n"
+ "st1h { z31.h }, p2, [x19, #6, MUL VL]\n"
+ "st1h { z11.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "st1h { z2.h }, p2, [x19, #-8, MUL VL]\n"
+ "st1h { z5.h }, p2, [x19, #-7, MUL VL]\n"
+ "st1h { z7.h }, p2, [x19, #-6, MUL VL]\n"
+ "st1h { z25.h }, p2, [x19, #-5, MUL VL]\n"
+ "st1h { z20.h }, p2, [x19, #-4, MUL VL]\n"
+ "st1h { z19.h }, p2, [x19, #-3, MUL VL]\n"
+ "st1h { z18.h }, p2, [x19, #-2, MUL VL]\n"
+ "st1h { z9.h }, p2, [x19, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x22\n"
+ "mov x19, x10\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z18.h }, p1/Z, [x11]\n"
+ "ld1h { z16.h }, p1/Z, [x9]\n"
+ "zip1 z0.h, z18.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x28]\n"
+ "dech x20\n"
+ "zip2 z31.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x27]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "zip1 z30.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ "zip2 z29.h, z17.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z28.h, z18.h, z16.h\n"
+ "ld1h { z17.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ "zip2 z27.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "zip1 z26.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x26]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip2 z25.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "ld1h { z17.h }, p1/Z, [x25]\n"
+ "zip1 z24.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "zip2 z23.h, z18.h, z17.h\n"
+ "ld1h { z18.h }, p1/Z, [x24]\n"
+ "decw x22, ALL, MUL #4\n"
+ "zip1 z22.h, z19.h, z16.h\n"
+ "ld1h { z21.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "zip2 z20.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x23]\n"
+ "cmp x22, #0x0\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z0.h }, p2, [x19]\n"
+ "st1h { z31.h }, p2, [x19, #1, MUL VL]\n"
+ "zip1 z17.h, z21.h, z16.h\n"
+ "st1h { z28.h }, p2, [x19, #2, MUL VL]\n"
+ "zip2 z16.h, z21.h, z16.h\n"
+ "st1h { z27.h }, p2, [x19, #3, MUL VL]\n"
+ "st1h { z30.h }, p2, [x19, #4, MUL VL]\n"
+ "st1h { z29.h }, p2, [x19, #5, MUL VL]\n"
+ "st1h { z26.h }, p2, [x19, #6, MUL VL]\n"
+ "st1h { z25.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "st1h { z24.h }, p2, [x19, #-8, MUL VL]\n"
+ "st1h { z23.h }, p2, [x19, #-7, MUL VL]\n"
+ "st1h { z22.h }, p2, [x19, #-6, MUL VL]\n"
+ "st1h { z20.h }, p2, [x19, #-5, MUL VL]\n"
+ "st1h { z19.h }, p2, [x19, #-4, MUL VL]\n"
+ "st1h { z18.h }, p2, [x19, #-3, MUL VL]\n"
+ "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #16\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add %x[in], x9, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x20, %x[width]\n"
+ "cnth x19, ALL, MUL #4\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z18.h }, p2/Z, [x11]\n"
+ "sub x20, x20, x19\n"
+ "ld1h { z24.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1h { z23.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "ld1h { z16.h }, p2/Z, [x9]\n"
+ "zip1 z21.h, z18.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "zip1 z18.h, z24.h, z17.h\n"
+ "st1h { z21.h }, p2, [x10]\n"
+ "addvl x9, x9, #4\n"
+ "zip2 z17.h, z24.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #1, MUL VL]\n"
+ "zip1 z16.h, z23.h, z20.h\n"
+ "st1h { z18.h }, p2, [x10, #2, MUL VL]\n"
+ "zip2 z18.h, z23.h, z20.h\n"
+ "st1h { z17.h }, p2, [x10, #3, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip1 z17.h, z22.h, z19.h\n"
+ "st1h { z16.h }, p2, [x10]\n"
+ "zip2 z16.h, z22.h, z19.h\n"
+ "st1h { z18.h }, p2, [x10, #1, MUL VL]\n"
+ "st1h { z17.h }, p2, [x10, #2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x10, #3, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "decw x20, ALL, MUL #4\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
+ "ld1h { z16.h }, p0/Z, [x9]\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "dech x19\n"
+ "zip2 z19.h, z17.h, z16.h\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z18.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ "ld1h { z16.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "st1h { z20.h }, p2, [x10]\n"
+ "addvl x9, x9, #2\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z19.h }, p2, [x10, #1, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "st1h { z17.h }, p2, [x10, #2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x10, #3, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #4\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
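+// bfloat16 entry point: no value conversion is needed for a pure transpose,
+// so the data is handled as raw uint16_t; the "/ 2" below rescales the byte
+// width back to 16-bit elements.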
+template<>
+void Transform<4, 2, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_4VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
new file mode 100644
index 0000000000..67ef738645
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_1x8.hpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
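+// Transpose-interleave producing 6 vector lengths of 8-bit data per output
+// row, interleaving groups of 8 input rows element by element (1x8). Heights
+// are padded to a multiple of 8 with the zeroed pad_row; a full 8-row group
+// therefore writes exactly 6 vector lengths of bytes per out_stride step.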
+void sve_transpose_interleave_6VL_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 8) * get_vector_length<uint64_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "csel x21, x21, %x[pad_row], GT\n"
+ "csel x22, x22, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #3\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z22.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z21.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1b { z12.b }, p1/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "ld1b { z20.b }, p1/Z, [x27]\n"
+ "ld1b { z11.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "ld1b { z10.b }, p1/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "ld1b { z19.b }, p1/Z, [x26]\n"
+ "ld1b { z9.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "ld1b { z8.b }, p1/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "ld1b { z7.b }, p1/Z, [x25]\n"
+ "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "ld1b { z5.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z18.b, z22.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "zip2 z4.b, z22.b, z16.b\n"
+ "ld1b { z3.b }, p1/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "zip1 z2.b, z21.b, z17.b\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "zip2 z1.b, z21.b, z17.b\n"
+ "ld1b { z0.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip1 z31.b, z12.b, z3.b\n"
+ "ld1b { z30.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "zip1 z29.b, z20.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x22]\n"
+ "zip2 z28.b, z20.b, z16.b\n"
+ "ld1b { z27.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "zip1 z26.b, z11.b, z0.b\n"
+ "ld1b { z25.b }, p1/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "zip1 z16.b, z19.b, z17.b\n"
+ "ld1b { z24.b }, p1/Z, [x21]\n"
+ "zip2 z21.b, z19.b, z17.b\n"
+ "ld1b { z22.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "ld1b { z23.b }, p1/Z, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #3\n"
+ "zip1 z19.b, z7.b, z24.b\n"
+ "zip2 z18.b, z18.b, z16.b\n"
+ "zip1 z17.b, z29.b, z19.b\n"
+ "zip1 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z20.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z17.b, z29.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z18.b, z4.b, z21.b\n"
+ "zip2 z19.b, z7.b, z24.b\n"
+ "zip1 z17.b, z28.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z18.b, z4.b, z21.b\n"
+ "zip2 z17.b, z28.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip1 z20.b, z9.b, z27.b\n"
+ "zip1 z18.b, z2.b, z20.b\n"
+ "zip1 z19.b, z6.b, z22.b\n"
+ "zip1 z17.b, z26.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip2 z18.b, z2.b, z20.b\n"
+ "zip2 z17.b, z26.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z21.b, z9.b, z27.b\n"
+ "zip2 z20.b, z11.b, z0.b\n"
+ "zip1 z18.b, z1.b, z21.b\n"
+ "zip2 z19.b, z6.b, z22.b\n"
+ "zip1 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z18.b, z1.b, z21.b\n"
+ "zip2 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z21.b, z8.b, z25.b\n"
+ "zip1 z18.b, z31.b, z21.b\n"
+ "zip1 z20.b, z10.b, z30.b\n"
+ "zip1 z19.b, z5.b, z23.b\n"
+ "zip1 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip2 z18.b, z31.b, z21.b\n"
+ "zip2 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z22.b, z12.b, z3.b\n"
+ "zip2 z21.b, z8.b, z25.b\n"
+ "zip1 z18.b, z22.b, z21.b\n"
+ "zip2 z20.b, z10.b, z30.b\n"
+ "zip2 z19.b, z5.b, z23.b\n"
+ "zip1 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip2 z18.b, z22.b, z21.b\n"
+ "zip2 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x20, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "incd x9, ALL, MUL #6\n"
+ "ld1b { z28.b }, p0/Z, [x27]\n"
+ "incd x27, ALL, MUL #6\n"
+ "ld1b { z17.b }, p0/Z, [x26]\n"
+ "incd x26, ALL, MUL #6\n"
+ "ld1b { z27.b }, p0/Z, [x25]\n"
+ "incd x25, ALL, MUL #6\n"
+ "ld1b { z16.b }, p0/Z, [x24]\n"
+ "zip1 z26.b, z18.b, z16.b\n"
+ "ld1b { z25.b }, p0/Z, [x23]\n"
+ "incd x24, ALL, MUL #6\n"
+ "zip2 z24.b, z18.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x22]\n"
+ "incd x23, ALL, MUL #6\n"
+ "zip1 z23.b, z28.b, z25.b\n"
+ "ld1b { z22.b }, p0/Z, [x21]\n"
+ "incd x22, ALL, MUL #6\n"
+ "zip1 z21.b, z17.b, z16.b\n"
+ "incd x21, ALL, MUL #6\n"
+ "zip2 z20.b, z17.b, z16.b\n"
+ "decd x20, ALL, MUL #6\n"
+ "zip1 z18.b, z26.b, z21.b\n"
+ "cmp x20, #0x0\n"
+ "zip1 z19.b, z27.b, z22.b\n"
+ "zip1 z17.b, z23.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z18.b, z26.b, z21.b\n"
+ "zip2 z17.b, z23.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z18.b, z24.b, z20.b\n"
+ "zip2 z17.b, z28.b, z25.b\n"
+ "zip2 z16.b, z27.b, z22.b\n"
+ "zip1 z17.b, z17.b, z16.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #6\n"
+ "cmp %x[height], #0x1\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
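+// As with the 4VL_1x4 case, the int8_t specialisation reuses the unsigned
+// kernel since only the layout changes, not the values.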
+template<>
+void Transform<6, 8, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<6, 8, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
new file mode 100644
index 0000000000..19d3d9dfe4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4.hpp
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
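+// Transpose-interleave producing 6 vector lengths of 16-bit data per output
+// row, zipping input rows in 2x4 blocks; heights are padded to a multiple of
+// 4 with the zeroed pad_row.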
+void sve_transpose_interleave_6VL_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x22, %x[width]\n"
+ "cnth x21, ALL, MUL #3\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z19.h }, p2/Z, [x11]\n"
+ "mov x20, x10\n"
+ "ld1h { z18.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "ld1h { z10.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "addvl x11, x11, #3\n"
+ "ld1h { z24.h }, p2/Z, [x9]\n"
+ "mov x19, x10\n"
+ "ld1h { z23.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "ld1h { z9.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "ld1h { z16.h }, p2/Z, [x28]\n"
+ "zip1 z22.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "zip2 z21.h, z19.h, z16.h\n"
+ "ld1h { z8.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "addvl x28, x28, #3\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x27]\n"
+ "cmp x22, x21\n"
+ "zip2 z7.h, z18.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "zip1 z6.h, z10.h, z8.h\n"
+ "ld1h { z5.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "zip1 z17.h, z24.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x26]\n"
+ "zip2 z16.h, z24.h, z16.h\n"
+ "ld1h { z4.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "zip1 z3.h, z22.h, z17.h\n"
+ "ld1h { z2.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "addvl x26, x26, #3\n"
+ "zip2 z1.h, z22.h, z17.h\n"
+ "ld1h { z0.h }, p2/Z, [x25]\n"
+ "zip1 z31.h, z21.h, z16.h\n"
+ "ld1h { z30.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip2 z29.h, z21.h, z16.h\n"
+ "ld1h { z28.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "zip1 z16.h, z23.h, z19.h\n"
+ "ld1h { z17.h }, p2/Z, [x24]\n"
+ "zip2 z27.h, z23.h, z19.h\n"
+ "ld1h { z26.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z25.h, z20.h, z16.h\n"
+ "ld1h { z24.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "zip2 z23.h, z20.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "ld1h { z22.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip2 z19.h, z18.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "zip1 z18.h, z0.h, z16.h\n"
+ "st1h { z3.h }, p2, [x20]\n"
+ "zip2 z17.h, z0.h, z16.h\n"
+ "st1h { z1.h }, p2, [x20, #1, MUL VL]\n"
+ "zip1 z16.h, z20.h, z18.h\n"
+ "st1h { z31.h }, p2, [x20, #2, MUL VL]\n"
+ "zip2 z18.h, z20.h, z18.h\n"
+ "st1h { z29.h }, p2, [x20, #3, MUL VL]\n"
+ "zip1 z20.h, z19.h, z17.h\n"
+ "st1h { z25.h }, p2, [x20, #4, MUL VL]\n"
+ "zip2 z19.h, z19.h, z17.h\n"
+ "st1h { z23.h }, p2, [x20, #5, MUL VL]\n"
+ "zip1 z17.h, z4.h, z26.h\n"
+ "st1h { z16.h }, p2, [x20, #6, MUL VL]\n"
+ "zip1 z16.h, z30.h, z22.h\n"
+ "st1h { z18.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "zip1 z18.h, z17.h, z16.h\n"
+ "st1h { z20.h }, p2, [x20, #-4, MUL VL]\n"
+ "zip2 z16.h, z17.h, z16.h\n"
+ "st1h { z19.h }, p2, [x20, #-3, MUL VL]\n"
+ "zip1 z17.h, z7.h, z27.h\n"
+ "st1h { z18.h }, p2, [x20, #-2, MUL VL]\n"
+ "zip2 z18.h, z7.h, z27.h\n"
+ "st1h { z16.h }, p2, [x20, #-1, MUL VL]\n"
+ "zip1 z16.h, z9.h, z5.h\n"
+ "st1h { z17.h }, p2, [x19]\n"
+ "zip1 z17.h, z6.h, z16.h\n"
+ "st1h { z18.h }, p2, [x19, #1, MUL VL]\n"
+ "zip2 z16.h, z6.h, z16.h\n"
+ "st1h { z17.h }, p2, [x19, #2, MUL VL]\n"
+ "zip2 z18.h, z10.h, z8.h\n"
+ "st1h { z16.h }, p2, [x19, #3, MUL VL]\n"
+ "zip2 z17.h, z9.h, z5.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #4, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #5, MUL VL]\n"
+ "zip2 z18.h, z4.h, z26.h\n"
+ "zip2 z17.h, z30.h, z22.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #6, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ "zip1 z18.h, z2.h, z24.h\n"
+ "zip1 z17.h, z28.h, z21.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-4, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-3, MUL VL]\n"
+ "zip2 z18.h, z2.h, z24.h\n"
+ "zip2 z17.h, z28.h, z21.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-2, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x22\n"
+ "mov x19, x10\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z18.h }, p1/Z, [x11]\n"
+ "ld1h { z23.h }, p1/Z, [x9]\n"
+ "dech x20\n"
+ "ld1h { z16.h }, p1/Z, [x28]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "ld1h { z20.h }, p1/Z, [x27]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "zip2 z22.h, z18.h, z16.h\n"
+ "ld1h { z21.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "zip1 z16.h, z23.h, z20.h\n"
+ "ld1h { z19.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "incd x11, ALL, MUL #4\n"
+ "zip1 z0.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "addvl x9, x9, #1\n"
+ "zip2 z31.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "incd x9, ALL, MUL #4\n"
+ "zip2 z16.h, z23.h, z20.h\n"
+ "ld1h { z30.h }, p1/Z, [x26]\n"
+ "addvl x28, x28, #1\n"
+ "zip1 z20.h, z22.h, z16.h\n"
+ "ld1h { z29.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "incd x28, ALL, MUL #4\n"
+ "zip2 z28.h, z22.h, z16.h\n"
+ "ld1h { z27.h }, p1/Z, [x25]\n"
+ "addvl x27, x27, #1\n"
+ "zip1 z18.h, z21.h, z18.h\n"
+ "ld1h { z26.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "incd x27, ALL, MUL #4\n"
+ "zip1 z17.h, z19.h, z17.h\n"
+ "ld1h { z16.h }, p1/Z, [x24]\n"
+ "addvl x26, x26, #1\n"
+ "zip1 z25.h, z18.h, z17.h\n"
+ "ld1h { z24.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "incd x26, ALL, MUL #4\n"
+ "zip2 z23.h, z18.h, z17.h\n"
+ "ld1h { z22.h }, p1/Z, [x23]\n"
+ "addvl x25, x25, #1\n"
+ "zip1 z19.h, z30.h, z16.h\n"
+ "ld1h { z21.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "incd x25, ALL, MUL #4\n"
+ "zip2 z17.h, z30.h, z16.h\n"
+ "st1h { z0.h }, p2, [x19]\n"
+ "addvl x24, x24, #1\n"
+ "zip1 z16.h, z27.h, z22.h\n"
+ "st1h { z31.h }, p2, [x19, #1, MUL VL]\n"
+ "incd x24, ALL, MUL #4\n"
+ "zip1 z18.h, z19.h, z16.h\n"
+ "st1h { z20.h }, p2, [x19, #2, MUL VL]\n"
+ "addvl x23, x23, #1\n"
+ "zip2 z20.h, z19.h, z16.h\n"
+ "st1h { z28.h }, p2, [x19, #3, MUL VL]\n"
+ "incd x23, ALL, MUL #4\n"
+ "zip2 z16.h, z27.h, z22.h\n"
+ "st1h { z25.h }, p2, [x19, #4, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip1 z19.h, z17.h, z16.h\n"
+ "st1h { z23.h }, p2, [x19, #5, MUL VL]\n"
+ "decd x22, ALL, MUL #6\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "st1h { z18.h }, p2, [x19, #6, MUL VL]\n"
+ "cmp x22, #0x0\n"
+ "zip1 z18.h, z29.h, z24.h\n"
+ "st1h { z20.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ "zip1 z16.h, z26.h, z21.h\n"
+ "st1h { z19.h }, p2, [x19, #-4, MUL VL]\n"
+ "st1h { z17.h }, p2, [x19, #-3, MUL VL]\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "zip2 z16.h, z18.h, z16.h\n"
+ "st1h { z17.h }, p2, [x19, #-2, MUL VL]\n"
+ "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #12\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add %x[in], x27, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
+ "cnth x19, ALL, MUL #3\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z19.h }, p2/Z, [x11]\n"
+ "sub x20, x20, x19\n"
+ "ld1h { z18.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1h { z30.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "addvl x11, x11, #3\n"
+ "ld1h { z29.h }, p2/Z, [x9]\n"
+ "ld1h { z28.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z27.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "addvl x9, x9, #3\n"
+ "ld1h { z16.h }, p2/Z, [x28]\n"
+ "zip1 z26.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip2 z25.h, z19.h, z16.h\n"
+ "ld1h { z24.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "addvl x28, x28, #3\n"
+ "zip1 z23.h, z18.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x27]\n"
+ "zip2 z22.h, z18.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "zip1 z20.h, z30.h, z24.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "zip1 z18.h, z29.h, z16.h\n"
+ "zip2 z17.h, z29.h, z16.h\n"
+ "zip1 z16.h, z26.h, z18.h\n"
+ "st1h { z16.h }, p2, [x10]\n"
+ "zip2 z16.h, z26.h, z18.h\n"
+ "st1h { z16.h }, p2, [x10, #1, MUL VL]\n"
+ "zip1 z16.h, z25.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "zip2 z16.h, z25.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #3, MUL VL]\n"
+ "zip1 z17.h, z28.h, z21.h\n"
+ "zip1 z16.h, z23.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #4, MUL VL]\n"
+ "zip2 z16.h, z23.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #5, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip2 z18.h, z28.h, z21.h\n"
+ "zip1 z17.h, z27.h, z19.h\n"
+ "zip1 z16.h, z22.h, z18.h\n"
+ "st1h { z16.h }, p2, [x10]\n"
+ "zip2 z16.h, z22.h, z18.h\n"
+ "st1h { z16.h }, p2, [x10, #1, MUL VL]\n"
+ "zip1 z16.h, z20.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "zip2 z16.h, z20.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #3, MUL VL]\n"
+ "zip2 z18.h, z30.h, z24.h\n"
+ "zip2 z17.h, z27.h, z19.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #4, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #5, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "decd x20, ALL, MUL #6\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
+ "ld1h { z25.h }, p0/Z, [x9]\n"
+ "dech x19\n"
+ "ld1h { z16.h }, p0/Z, [x28]\n"
+ "zip1 z18.h, z17.h, z16.h\n"
+ "ld1h { z24.h }, p0/Z, [x27]\n"
+ "whilelt p0.h, XZR, x19\n"
+ "zip2 z23.h, z17.h, z16.h\n"
+ "ld1h { z22.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "zip1 z16.h, z25.h, z24.h\n"
+ "ld1h { z21.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "incd x11, ALL, MUL #4\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "addvl x9, x9, #1\n"
+ "zip2 z18.h, z18.h, z16.h\n"
+ "ld1h { z19.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "incd x9, ALL, MUL #4\n"
+ "zip2 z16.h, z25.h, z24.h\n"
+ "st1h { z17.h }, p2, [x10]\n"
+ "addvl x28, x28, #1\n"
+ "zip1 z17.h, z23.h, z16.h\n"
+ "st1h { z18.h }, p2, [x10, #1, MUL VL]\n"
+ "incd x28, ALL, MUL #4\n"
+ "zip2 z16.h, z23.h, z16.h\n"
+ "st1h { z17.h }, p2, [x10, #2, MUL VL]\n"
+ "addvl x27, x27, #1\n"
+ "zip1 z18.h, z22.h, z20.h\n"
+ "st1h { z16.h }, p2, [x10, #3, MUL VL]\n"
+ "incd x27, ALL, MUL #4\n"
+ "zip1 z17.h, z21.h, z19.h\n"
+ "cmp x20, #0x0\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #4, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #5, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #6\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
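+// bfloat16 entry point for the 2x4 kernel; as in the 4VL case, the data is
+// moved as raw uint16_t and the byte width is rescaled to 16-bit elements.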
+template<>
+void Transform<6, 4, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_2x4(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..94ce157185
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_2x4_fp32bf16.hpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
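+// Fused transpose-interleave and narrowing: reads fp32 input, zips rows in
+// 2x4 blocks, and converts to bfloat16 with BFCVT/BFCVTNT as it stores, so
+// no separate conversion pass over the data is required.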
+void sve_transpose_interleave_6VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "mov x22, %x[width]\n"
+ "cnth x19, ALL, MUL #3\n"
+ "add x21, x23, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "csel x21, x21, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "cmp x22, x19\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z17.s }, p3/Z, [x25]\n"
+ "ld1w { z18.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "sub x22, x22, x19\n"
+ "cmp x22, x19\n"
+ "ld1w { z19.s }, p3/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23]\n"
+ "zip1 z21.s, z17.s, z16.s\n"
+ "zip2 z20.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23, #2, MUL VL]\n"
+ "zip1 z29.s, z18.s, z17.s\n"
+ "zip2 z28.s, z18.s, z17.s\n"
+ "ld1w { z17.s }, p3/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x25, #4, MUL VL]\n"
+ "zip1 z27.s, z19.s, z16.s\n"
+ "zip2 z26.s, z19.s, z16.s\n"
+ "ld1w { z19.s }, p3/Z, [x25, #5, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23, #3, MUL VL]\n"
+ "zip1 z25.s, z17.s, z16.s\n"
+ "zip2 z24.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x23, #5, MUL VL]\n"
+ "zip1 z12.s, z18.s, z17.s\n"
+ "zip2 z11.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x24]\n"
+ "ld1w { z23.s }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip1 z10.s, z19.s, z16.s\n"
+ "zip2 z9.s, z19.s, z16.s\n"
+ "ld1w { z22.s }, p3/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z17.s }, p3/Z, [x21]\n"
+ ".inst 0x658aaea8 // bfcvt z8.h, p3/M, z21.s\n"
+ "zip1 z7.s, z18.s, z17.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z21.s }, p3/Z, [x21, #2, MUL VL]\n"
+ ".inst 0x658aae86 // bfcvt z6.h, p3/M, z20.s\n"
+ "zip2 z5.s, z18.s, z17.s\n"
+ "ld1w { z20.s }, p3/Z, [x24, #3, MUL VL]\n"
+ "ld1w { z19.s }, p3/Z, [x24, #4, MUL VL]\n"
+ ".inst 0x658aafa4 // bfcvt z4.h, p3/M, z29.s\n"
+ "zip1 z3.s, z23.s, z16.s\n"
+ "ld1w { z2.s }, p3/Z, [x24, #5, MUL VL]\n"
+ "ld1w { z18.s }, p3/Z, [x21, #3, MUL VL]\n"
+ ".inst 0x658aaf81 // bfcvt z1.h, p3/M, z28.s\n"
+ "zip2 z0.s, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x21, #4, MUL VL]\n"
+ "ld1w { z16.s }, p3/Z, [x21, #5, MUL VL]\n"
+ ".inst 0x658aaf7f // bfcvt z31.h, p3/M, z27.s\n"
+ "zip1 z30.s, z22.s, z21.s\n"
+ ".inst 0x658aaf5d // bfcvt z29.h, p3/M, z26.s\n"
+ "zip2 z28.s, z22.s, z21.s\n"
+ "addvl x25, x25, #6\n"
+ "addvl x24, x24, #6\n"
+ ".inst 0x658aaf3b // bfcvt z27.h, p3/M, z25.s\n"
+ "zip1 z26.s, z20.s, z18.s\n"
+ "addvl x23, x23, #6\n"
+ "addvl x21, x21, #6\n"
+ ".inst 0x658aaf19 // bfcvt z25.h, p3/M, z24.s\n"
+ "zip2 z24.s, z20.s, z18.s\n"
+ ".inst 0x658aad97 // bfcvt z23.h, p3/M, z12.s\n"
+ "zip1 z22.s, z19.s, z17.s\n"
+ ".inst 0x658aad75 // bfcvt z21.h, p3/M, z11.s\n"
+ "zip2 z20.s, z19.s, z17.s\n"
+ ".inst 0x658aad53 // bfcvt z19.h, p3/M, z10.s\n"
+ "zip1 z18.s, z2.s, z16.s\n"
+ ".inst 0x658aad31 // bfcvt z17.h, p3/M, z9.s\n"
+ "zip2 z16.s, z2.s, z16.s\n"
+ ".inst 0x648aace8 // bfcvtnt z8.h, p3/M, z7.s\n"
+ ".inst 0x648aaca6 // bfcvtnt z6.h, p3/M, z5.s\n"
+ "st1h { z8.h }, p3, [x20]\n"
+ ".inst 0x648aac64 // bfcvtnt z4.h, p3/M, z3.s\n"
+ ".inst 0x648aac01 // bfcvtnt z1.h, p3/M, z0.s\n"
+ "st1h { z6.h }, p3, [x20, #1, MUL VL]\n"
+ ".inst 0x648aafdf // bfcvtnt z31.h, p3/M, z30.s\n"
+ ".inst 0x648aaf9d // bfcvtnt z29.h, p3/M, z28.s\n"
+ "st1h { z4.h }, p3, [x20, #2, MUL VL]\n"
+ "st1h { z1.h }, p3, [x20, #3, MUL VL]\n"
+ ".inst 0x648aaf5b // bfcvtnt z27.h, p3/M, z26.s\n"
+ ".inst 0x648aaf19 // bfcvtnt z25.h, p3/M, z24.s\n"
+ "st1h { z31.h }, p3, [x20, #4, MUL VL]\n"
+ ".inst 0x648aaed7 // bfcvtnt z23.h, p3/M, z22.s\n"
+ ".inst 0x648aae95 // bfcvtnt z21.h, p3/M, z20.s\n"
+ "st1h { z29.h }, p3, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ ".inst 0x648aae53 // bfcvtnt z19.h, p3/M, z18.s\n"
+ ".inst 0x648aae11 // bfcvtnt z17.h, p3/M, z16.s\n"
+ "st1h { z27.h }, p3, [x20]\n"
+ "st1h { z25.h }, p3, [x20, #1, MUL VL]\n"
+ "st1h { z23.h }, p3, [x20, #2, MUL VL]\n"
+ "st1h { z21.h }, p3, [x20, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x20, #4, MUL VL]\n"
+ "st1h { z17.h }, p3, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x19, x22\n"
+ "whilelt p2.s, XZR, x19\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ "ld1w { z19.s }, p2/Z, [x23]\n"
+ "decw x19\n"
+ "whilelt p1.s, XZR, x19\n"
+ "ld1w { z18.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z25.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z24.s }, p2/Z, [x24]\n"
+ "ld1w { z30.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "zip1 z23.s, z20.s, z19.s\n"
+ "zip2 z22.s, z20.s, z19.s\n"
+ "ld1w { z29.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [x21]\n"
+ "zip1 z20.s, z18.s, z17.s\n"
+ "zip2 z19.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x21, #2, MUL VL]\n"
+ "zip1 z17.s, z25.s, z16.s\n"
+ "zip2 z16.s, z25.s, z16.s\n"
+ "decd x22, ALL, MUL #6\n"
+ ".inst 0x658aaefb // bfcvt z27.h, p3/M, z23.s\n"
+ "zip1 z26.s, z24.s, z21.s\n"
+ "cmp x22, #0x0\n"
+ ".inst 0x658aaed9 // bfcvt z25.h, p3/M, z22.s\n"
+ "zip2 z24.s, z24.s, z21.s\n"
+ "addvl x25, x25, #3\n"
+ "addvl x24, x24, #3\n"
+ ".inst 0x658aae97 // bfcvt z23.h, p3/M, z20.s\n"
+ "zip1 z22.s, z30.s, z18.s\n"
+ "addvl x23, x23, #3\n"
+ "addvl x21, x21, #3\n"
+ ".inst 0x658aae75 // bfcvt z21.h, p3/M, z19.s\n"
+ "zip2 z20.s, z30.s, z18.s\n"
+ ".inst 0x658aae33 // bfcvt z19.h, p3/M, z17.s\n"
+ "zip1 z18.s, z29.s, z28.s\n"
+ ".inst 0x658aae11 // bfcvt z17.h, p3/M, z16.s\n"
+ "zip2 z16.s, z29.s, z28.s\n"
+ ".inst 0x648aaf5b // bfcvtnt z27.h, p3/M, z26.s\n"
+ ".inst 0x648aaf19 // bfcvtnt z25.h, p3/M, z24.s\n"
+ "st1h { z27.h }, p3, [x20]\n"
+ ".inst 0x648aaed7 // bfcvtnt z23.h, p3/M, z22.s\n"
+ ".inst 0x648aae95 // bfcvtnt z21.h, p3/M, z20.s\n"
+ "st1h { z25.h }, p3, [x20, #1, MUL VL]\n"
+ ".inst 0x648aae53 // bfcvtnt z19.h, p3/M, z18.s\n"
+ ".inst 0x648aae11 // bfcvtnt z17.h, p3/M, z16.s\n"
+ "st1h { z23.h }, p3, [x20, #2, MUL VL]\n"
+ "st1h { z21.h }, p3, [x20, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x20, #4, MUL VL]\n"
+ "st1h { z17.h }, p3, [x20, #5, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #6\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<6, 4, true, VLType::SVE>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
new file mode 100644
index 0000000000..46b160b071
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_6VL_4x2.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
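+// Transpose-interleave producing 6 vector lengths of 32-bit data per output
+// row, zipping pairs of input rows (4x2 blocks); odd heights are padded to a
+// multiple of 2 with the zeroed pad_row.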
+void sve_transpose_interleave_6VL_4x2(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint32_t *pad_row = reinterpret_cast<uint32_t *>(alloca(width * sizeof(uint32_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint32_t));
+ }
+
+ size_t out_stride = 6 * roundup<size_t>(height, 2) * get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cmp %x[height], #0x4\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x27, %x[in]\n"
+ "mov x26, %x[out]\n"
+ "add x25, x27, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x22, %x[width]\n"
+ "cntw x21, ALL, MUL #6\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z19.s }, p2/Z, [x27]\n"
+ "mov x20, x26\n"
+ "ld1w { z18.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "ld1w { z21.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "mov x19, x26\n"
+ "ld1w { z26.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "ld1w { z25.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z24.s }, p2/Z, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #6\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ "zip1 z23.s, z19.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, #1, MUL VL]\n"
+ "cmp x22, x21\n"
+ "zip2 z9.s, z19.s, z16.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x25, #3, MUL VL]\n"
+ "zip1 z8.s, z18.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #4, MUL VL]\n"
+ "zip2 z7.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x25, #5, MUL VL]\n"
+ "addvl x25, x25, #6\n"
+ "zip1 z6.s, z21.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x24]\n"
+ "zip2 z5.s, z21.s, z20.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z4.s, z26.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z3.s, z26.s, z19.s\n"
+ "ld1w { z2.s }, p2/Z, [x24, #3, MUL VL]\n"
+ "zip1 z1.s, z25.s, z16.s\n"
+ "ld1w { z0.s }, p2/Z, [x24, #4, MUL VL]\n"
+ "zip2 z31.s, z25.s, z16.s\n"
+ "ld1w { z30.s }, p2/Z, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #6\n"
+ "zip1 z29.s, z24.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23]\n"
+ "zip2 z28.s, z24.s, z18.s\n"
+ "ld1w { z20.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip1 z27.s, z17.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, #3, MUL VL]\n"
+ "zip2 z26.s, z17.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, #4, MUL VL]\n"
+ "zip1 z25.s, z22.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, #5, MUL VL]\n"
+ "addvl x23, x23, #6\n"
+ "zip2 z24.s, z22.s, z20.s\n"
+ "st1w { z23.s }, p2, [x20]\n"
+ "zip1 z23.s, z21.s, z19.s\n"
+ "st1w { z9.s }, p2, [x20, #1, MUL VL]\n"
+ "zip2 z22.s, z21.s, z19.s\n"
+ "st1w { z8.s }, p2, [x20, #2, MUL VL]\n"
+ "zip1 z21.s, z2.s, z18.s\n"
+ "st1w { z7.s }, p2, [x20, #3, MUL VL]\n"
+ "zip2 z20.s, z2.s, z18.s\n"
+ "st1w { z6.s }, p2, [x20, #4, MUL VL]\n"
+ "zip1 z19.s, z0.s, z17.s\n"
+ "st1w { z5.s }, p2, [x20, #5, MUL VL]\n"
+ "zip2 z18.s, z0.s, z17.s\n"
+ "st1w { z27.s }, p2, [x20, #6, MUL VL]\n"
+ "zip1 z17.s, z30.s, z16.s\n"
+ "st1w { z26.s }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #12\n"
+ "zip2 z16.s, z30.s, z16.s\n"
+ "st1w { z25.s }, p2, [x20, #-4, MUL VL]\n"
+ "st1w { z24.s }, p2, [x20, #-3, MUL VL]\n"
+ "st1w { z23.s }, p2, [x20, #-2, MUL VL]\n"
+ "st1w { z22.s }, p2, [x20, #-1, MUL VL]\n"
+ "st1w { z4.s }, p2, [x19]\n"
+ "st1w { z3.s }, p2, [x19, #1, MUL VL]\n"
+ "st1w { z1.s }, p2, [x19, #2, MUL VL]\n"
+ "st1w { z31.s }, p2, [x19, #3, MUL VL]\n"
+ "st1w { z29.s }, p2, [x19, #4, MUL VL]\n"
+ "st1w { z28.s }, p2, [x19, #5, MUL VL]\n"
+ "st1w { z21.s }, p2, [x19, #6, MUL VL]\n"
+ "st1w { z20.s }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ "st1w { z19.s }, p2, [x19, #-4, MUL VL]\n"
+ "st1w { z18.s }, p2, [x19, #-3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x19, #-2, MUL VL]\n"
+ "st1w { z16.s }, p2, [x19, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x22\n"
+ "mov x19, x26\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z18.s }, p0/Z, [x27]\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
+ "zip1 z28.s, z18.s, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x24]\n"
+ "decw x20\n"
+ "zip2 z27.s, z18.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ "whilelt p1.s, XZR, x20\n"
+ "zip1 z26.s, z17.s, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x27, #1, MUL VL]\n"
+ "decw x20\n"
+ "zip2 z25.s, z17.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "zip1 z24.s, z18.s, z16.s\n"
+ "ld1w { z17.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "addvl x27, x27, #3\n"
+ "zip2 z23.s, z18.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "addvl x25, x25, #3\n"
+ "zip1 z22.s, z17.s, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x24, #1, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "zip2 z21.s, z17.s, z16.s\n"
+ "ld1w { z20.s }, p0/Z, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #3\n"
+ "ld1w { z17.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip1 z19.s, z18.s, z17.s\n"
+ "ld1w { z16.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "zip2 z18.s, z18.s, z17.s\n"
+ "st1w { z28.s }, p2, [x19]\n"
+ "decd x22, ALL, MUL #6\n"
+ "zip1 z17.s, z20.s, z16.s\n"
+ "st1w { z27.s }, p2, [x19, #1, MUL VL]\n"
+ "cmp x22, #0x0\n"
+ "zip2 z16.s, z20.s, z16.s\n"
+ "st1w { z24.s }, p2, [x19, #2, MUL VL]\n"
+ "st1w { z23.s }, p2, [x19, #3, MUL VL]\n"
+ "st1w { z22.s }, p2, [x19, #4, MUL VL]\n"
+ "st1w { z21.s }, p2, [x19, #5, MUL VL]\n"
+ "st1w { z26.s }, p2, [x19, #6, MUL VL]\n"
+ "st1w { z25.s }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #12\n"
+ "st1w { z19.s }, p2, [x19, #-4, MUL VL]\n"
+ "st1w { z18.s }, p2, [x19, #-3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x19, #-2, MUL VL]\n"
+ "st1w { z16.s }, p2, [x19, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #12\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x27, %x[in]\n"
+ "mov x26, %x[out]\n"
+ "add x25, x27, %x[in_stride]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x20, %x[width]\n"
+ "cntw x19, ALL, MUL #6\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1w { z19.s }, p2/Z, [x27]\n"
+ "sub x20, x20, x19\n"
+ "ld1w { z18.s }, p2/Z, [x27, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1w { z29.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x27, #4, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x27, #5, MUL VL]\n"
+ "addvl x27, x27, #6\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ "zip1 z25.s, z19.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip2 z24.s, z19.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [x25, #3, MUL VL]\n"
+ "zip1 z20.s, z18.s, z17.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, #4, MUL VL]\n"
+ "zip2 z19.s, z18.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x25, #5, MUL VL]\n"
+ "addvl x25, x25, #6\n"
+ "zip1 z18.s, z29.s, z16.s\n"
+ "st1w { z25.s }, p2, [x26]\n"
+ "zip2 z17.s, z29.s, z16.s\n"
+ "st1w { z24.s }, p2, [x26, #1, MUL VL]\n"
+ "zip1 z16.s, z28.s, z23.s\n"
+ "st1w { z20.s }, p2, [x26, #2, MUL VL]\n"
+ "zip2 z20.s, z28.s, z23.s\n"
+ "st1w { z19.s }, p2, [x26, #3, MUL VL]\n"
+ "zip1 z19.s, z27.s, z22.s\n"
+ "st1w { z18.s }, p2, [x26, #4, MUL VL]\n"
+ "zip2 z18.s, z27.s, z22.s\n"
+ "st1w { z17.s }, p2, [x26, #5, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "zip1 z17.s, z26.s, z21.s\n"
+ "st1w { z16.s }, p2, [x26]\n"
+ "zip2 z16.s, z26.s, z21.s\n"
+ "st1w { z20.s }, p2, [x26, #1, MUL VL]\n"
+ "st1w { z19.s }, p2, [x26, #2, MUL VL]\n"
+ "st1w { z18.s }, p2, [x26, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x26, #4, MUL VL]\n"
+ "st1w { z16.s }, p2, [x26, #5, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "decd x20, ALL, MUL #6\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z17.s }, p0/Z, [x27]\n"
+ "ld1w { z16.s }, p0/Z, [x25]\n"
+ "zip1 z22.s, z17.s, z16.s\n"
+ "decw x19\n"
+ "zip2 z21.s, z17.s, z16.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z17.s }, p0/Z, [x27, #1, MUL VL]\n"
+ "decw x19\n"
+ "ld1w { z16.s }, p0/Z, [x25, #1, MUL VL]\n"
+ "zip1 z20.s, z17.s, z16.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z19.s }, p0/Z, [x27, #2, MUL VL]\n"
+ "zip2 z18.s, z17.s, z16.s\n"
+ "addvl x27, x27, #3\n"
+ "ld1w { z16.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "zip1 z17.s, z19.s, z16.s\n"
+ "st1w { z22.s }, p2, [x26]\n"
+ "addvl x25, x25, #3\n"
+ "zip2 z16.s, z19.s, z16.s\n"
+ "st1w { z21.s }, p2, [x26, #1, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "st1w { z20.s }, p2, [x26, #2, MUL VL]\n"
+ "st1w { z18.s }, p2, [x26, #3, MUL VL]\n"
+ "st1w { z17.s }, p2, [x26, #4, MUL VL]\n"
+ "st1w { z16.s }, p2, [x26, #5, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #6\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
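+// This Transform<> specialization (like the analogous ones in the sibling
+// transform headers) adapts arm_gemm's generic tile interface to the raw
+// routine above: (x0, xmax) selects the column range and (k0, kmax) the row
+// range, so the width argument is a count of elements, the input stride is
+// converted to bytes, and (kmax-k0) becomes the row count.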
+template<>
+void Transform<6, 2, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_6VL_4x2(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
new file mode 100644
index 0000000000..56b7ed6eda
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL.hpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
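+// Transposes a (height x width) block of 32-bit elements into column panels
+// of eight SVE vectors: panel p stores, for each row r, the eight vectors
+// covering columns [p*8*VLw, (p+1)*8*VLw), contiguously per row.  Trailing
+// partial panels are loaded under a predicate but stored in full, so columns
+// beyond 'width' come out zero-filled.  out_stride is the byte size of one
+// full panel: 8 vectors * height rows * VL bytes.  A scalar sketch of the
+// intended layout (illustrative only; VLw is the number of 32-bit elements
+// per SVE vector):
+//
+//   for (size_t p = 0; p * 8 * VLw < width; p++)
+//     for (size_t r = 0; r < height; r++)
+//       for (size_t e = 0; e < 8 * VLw; e++)
+//         out[(p * height + r) * 8 * VLw + e] =
+//             (p * 8 * VLw + e < width) ? in[r * (in_stride / 4) + p * 8 * VLw + e] : 0;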
+void sve_transpose_interleave_8VL(uint32_t *out, const uint32_t *in, size_t width, size_t in_stride, size_t height)
+{
+ size_t out_stride = 8 * height * get_vector_length<uint8_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "cmp %x[height], #0x2\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[out]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x22, %x[width]\n"
+ "cntw x21, ALL, MUL #16\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z15.s }, p1/Z, [x25]\n"
+ "mov x20, x24\n"
+ "ld1w { z14.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "ld1w { z13.s }, p1/Z, [x25, #2, MUL VL]\n"
+ "mov x19, x24\n"
+ "ld1w { z12.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "ld1w { z11.s }, p1/Z, [x25, #4, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z10.s }, p1/Z, [x25, #5, MUL VL]\n"
+ "cmp x22, x21\n"
+ "ld1w { z9.s }, p1/Z, [x25, #6, MUL VL]\n"
+ "ld1w { z8.s }, p1/Z, [x25, #7, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ "ld1w { z7.s }, p1/Z, [x23]\n"
+ "ld1w { z6.s }, p1/Z, [x25, #-8, MUL VL]\n"
+ "ld1w { z5.s }, p1/Z, [x25, #-7, MUL VL]\n"
+ "ld1w { z4.s }, p1/Z, [x25, #-6, MUL VL]\n"
+ "ld1w { z3.s }, p1/Z, [x25, #-5, MUL VL]\n"
+ "ld1w { z2.s }, p1/Z, [x25, #-4, MUL VL]\n"
+ "ld1w { z1.s }, p1/Z, [x25, #-3, MUL VL]\n"
+ "ld1w { z0.s }, p1/Z, [x25, #-2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x25, #-1, MUL VL]\n"
+ "ld1w { z30.s }, p1/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z29.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x23, #5, MUL VL]\n"
+ "ld1w { z25.s }, p1/Z, [x23, #6, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x23, #7, MUL VL]\n"
+ "addvl x23, x23, #16\n"
+ "ld1w { z23.s }, p1/Z, [x23, #-8, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x23, #-7, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [x23, #-6, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x23, #-5, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x23, #-4, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x23, #-3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x23, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x23, #-1, MUL VL]\n"
+ "st1w { z15.s }, p1, [x20]\n"
+ "st1w { z14.s }, p1, [x20, #1, MUL VL]\n"
+ "st1w { z13.s }, p1, [x20, #2, MUL VL]\n"
+ "st1w { z12.s }, p1, [x20, #3, MUL VL]\n"
+ "st1w { z11.s }, p1, [x20, #4, MUL VL]\n"
+ "st1w { z10.s }, p1, [x20, #5, MUL VL]\n"
+ "st1w { z9.s }, p1, [x20, #6, MUL VL]\n"
+ "st1w { z8.s }, p1, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "st1w { z7.s }, p1, [x20, #-8, MUL VL]\n"
+ "st1w { z30.s }, p1, [x20, #-7, MUL VL]\n"
+ "st1w { z29.s }, p1, [x20, #-6, MUL VL]\n"
+ "st1w { z28.s }, p1, [x20, #-5, MUL VL]\n"
+ "st1w { z27.s }, p1, [x20, #-4, MUL VL]\n"
+ "st1w { z26.s }, p1, [x20, #-3, MUL VL]\n"
+ "st1w { z25.s }, p1, [x20, #-2, MUL VL]\n"
+ "st1w { z24.s }, p1, [x20, #-1, MUL VL]\n"
+ "st1w { z6.s }, p1, [x19]\n"
+ "st1w { z5.s }, p1, [x19, #1, MUL VL]\n"
+ "st1w { z4.s }, p1, [x19, #2, MUL VL]\n"
+ "st1w { z3.s }, p1, [x19, #3, MUL VL]\n"
+ "st1w { z2.s }, p1, [x19, #4, MUL VL]\n"
+ "st1w { z1.s }, p1, [x19, #5, MUL VL]\n"
+ "st1w { z0.s }, p1, [x19, #6, MUL VL]\n"
+ "st1w { z31.s }, p1, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "st1w { z23.s }, p1, [x19, #-8, MUL VL]\n"
+ "st1w { z22.s }, p1, [x19, #-7, MUL VL]\n"
+ "st1w { z21.s }, p1, [x19, #-6, MUL VL]\n"
+ "st1w { z20.s }, p1, [x19, #-5, MUL VL]\n"
+ "st1w { z19.s }, p1, [x19, #-4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x19, #-3, MUL VL]\n"
+ "st1w { z17.s }, p1, [x19, #-2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x19, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x22\n"
+ "mov x19, x24\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z31.s }, p0/Z, [x25]\n"
+ "ld1w { z30.s }, p0/Z, [x23]\n"
+ "decw x20\n"
+ "add x24, x24, %x[out_stride]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z29.s }, p0/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z28.s }, p0/Z, [x23, #1, MUL VL]\n"
+ "decw x20\n"
+ "decw x22, ALL, MUL #8\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z27.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z26.s }, p0/Z, [x23, #2, MUL VL]\n"
+ "decw x20\n"
+ "whilelt p0.s, XZR, x20\n"
+ "ld1w { z25.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "decw x20\n"
+ "ld1w { z24.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "decw x20\n"
+ "ld1w { z23.s }, p0/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z22.s }, p0/Z, [x23, #4, MUL VL]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "decw x20\n"
+ "ld1w { z21.s }, p0/Z, [x25, #5, MUL VL]\n"
+ "ld1w { z20.s }, p0/Z, [x23, #5, MUL VL]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "decw x20\n"
+ "ld1w { z19.s }, p0/Z, [x25, #6, MUL VL]\n"
+ "ld1w { z18.s }, p0/Z, [x23, #6, MUL VL]\n"
+ "whilelt p0.s, XZR, x20\n"
+ "cmp x22, #0x0\n"
+ "ld1w { z17.s }, p0/Z, [x25, #7, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x23, #7, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "st1w { z31.s }, p1, [x19]\n"
+ "addvl x23, x23, #8\n"
+ "st1w { z29.s }, p1, [x19, #1, MUL VL]\n"
+ "st1w { z27.s }, p1, [x19, #2, MUL VL]\n"
+ "st1w { z25.s }, p1, [x19, #3, MUL VL]\n"
+ "st1w { z23.s }, p1, [x19, #4, MUL VL]\n"
+ "st1w { z21.s }, p1, [x19, #5, MUL VL]\n"
+ "st1w { z19.s }, p1, [x19, #6, MUL VL]\n"
+ "st1w { z17.s }, p1, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "st1w { z30.s }, p1, [x19, #-8, MUL VL]\n"
+ "st1w { z28.s }, p1, [x19, #-7, MUL VL]\n"
+ "st1w { z26.s }, p1, [x19, #-6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x19, #-5, MUL VL]\n"
+ "st1w { z22.s }, p1, [x19, #-4, MUL VL]\n"
+ "st1w { z20.s }, p1, [x19, #-3, MUL VL]\n"
+ "st1w { z18.s }, p1, [x19, #-2, MUL VL]\n"
+ "st1w { z16.s }, p1, [x19, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #16\n"
+ "cmp %x[height], #0x2\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[out]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x1\n"
+ "mov x20, %x[width]\n"
+ "cntw x19, ALL, MUL #16\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1w { z31.s }, p1/Z, [x25]\n"
+ "sub x20, x20, x19\n"
+ "ld1w { z30.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1w { z29.s }, p1/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z28.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z26.s }, p1/Z, [x25, #5, MUL VL]\n"
+ "ld1w { z25.s }, p1/Z, [x25, #6, MUL VL]\n"
+ "ld1w { z24.s }, p1/Z, [x25, #7, MUL VL]\n"
+ "addvl x25, x25, #16\n"
+ "ld1w { z23.s }, p1/Z, [x25, #-8, MUL VL]\n"
+ "ld1w { z22.s }, p1/Z, [x25, #-7, MUL VL]\n"
+ "ld1w { z21.s }, p1/Z, [x25, #-6, MUL VL]\n"
+ "ld1w { z20.s }, p1/Z, [x25, #-5, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x25, #-4, MUL VL]\n"
+ "ld1w { z18.s }, p1/Z, [x25, #-3, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x25, #-2, MUL VL]\n"
+ "ld1w { z16.s }, p1/Z, [x25, #-1, MUL VL]\n"
+ "st1w { z31.s }, p1, [x24]\n"
+ "st1w { z30.s }, p1, [x24, #1, MUL VL]\n"
+ "st1w { z29.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z28.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z27.s }, p1, [x24, #4, MUL VL]\n"
+ "st1w { z26.s }, p1, [x24, #5, MUL VL]\n"
+ "st1w { z25.s }, p1, [x24, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "st1w { z23.s }, p1, [x24]\n"
+ "st1w { z22.s }, p1, [x24, #1, MUL VL]\n"
+ "st1w { z21.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #5, MUL VL]\n"
+ "st1w { z17.s }, p1, [x24, #6, MUL VL]\n"
+ "st1w { z16.s }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "decw x20, ALL, MUL #8\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z23.s }, p0/Z, [x25]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z22.s }, p0/Z, [x25, #1, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z21.s }, p0/Z, [x25, #2, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z20.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z19.s }, p0/Z, [x25, #4, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z18.s }, p0/Z, [x25, #5, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z17.s }, p0/Z, [x25, #6, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z16.s }, p0/Z, [x25, #7, MUL VL]\n"
+ "st1w { z23.s }, p1, [x24]\n"
+ "addvl x25, x25, #8\n"
+ "st1w { z22.s }, p1, [x24, #1, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "st1w { z21.s }, p1, [x24, #2, MUL VL]\n"
+ "st1w { z20.s }, p1, [x24, #3, MUL VL]\n"
+ "st1w { z19.s }, p1, [x24, #4, MUL VL]\n"
+ "st1w { z18.s }, p1, [x24, #5, MUL VL]\n"
+ "st1w { z17.s }, p1, [x24, #6, MUL VL]\n"
+ "st1w { z16.s }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #8\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 1, true, VLType::SVE>(
+ float *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL(
+ reinterpret_cast<uint32_t *>(out),
+ reinterpret_cast<const uint32_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(float) / 4,
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
new file mode 100644
index 0000000000..f81098b26e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x4.hpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
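+// Interleaves groups of four rows at byte granularity: two rounds of .b zips
+// place one byte from each of four consecutive rows into every 4-byte group
+// of the output.  Heights that are not a multiple of four read the missing
+// rows from the zeroed pad_row.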
+void sve_transpose_interleave_8VL_1x4(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "mov x24, %x[out]\n"
+ "add x23, x25, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x21, x21, %x[pad_row], GT\n"
+ "csel x22, x22, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #8\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z8.b }, p1/Z, [x25]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z24.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1b { z27.b }, p1/Z, [x25, #2, MUL VL]\n"
+ "ld1b { z25.b }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1b { z7.b }, p1/Z, [x25, #4, MUL VL]\n"
+ "ld1b { z3.b }, p1/Z, [x25, #5, MUL VL]\n"
+ "ld1b { z14.b }, p1/Z, [x25, #6, MUL VL]\n"
+ "ld1b { z13.b }, p1/Z, [x25, #7, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "ld1b { z16.b }, p1/Z, [x23]\n"
+ "ld1b { z12.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "ld1b { z15.b }, p1/Z, [x23, #2, MUL VL]\n"
+ "ld1b { z11.b }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1b { z4.b }, p1/Z, [x23, #4, MUL VL]\n"
+ "ld1b { z5.b }, p1/Z, [x23, #5, MUL VL]\n"
+ "ld1b { z26.b }, p1/Z, [x23, #6, MUL VL]\n"
+ "ld1b { z30.b }, p1/Z, [x23, #7, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "ld1b { z22.b }, p1/Z, [x22]\n"
+ "zip1 z21.b, z8.b, z22.b\n"
+ "ld1b { z2.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "zip2 z20.b, z8.b, z22.b\n"
+ "ld1b { z18.b }, p1/Z, [x22, #2, MUL VL]\n"
+ "ld1b { z17.b }, p1/Z, [x22, #3, MUL VL]\n"
+ "zip1 z10.b, z24.b, z2.b\n"
+ "ld1b { z22.b }, p1/Z, [x22, #4, MUL VL]\n"
+ "zip2 z9.b, z24.b, z2.b\n"
+ "ld1b { z6.b }, p1/Z, [x22, #5, MUL VL]\n"
+ "zip1 z0.b, z27.b, z18.b\n"
+ "ld1b { z1.b }, p1/Z, [x22, #6, MUL VL]\n"
+ "zip2 z28.b, z27.b, z18.b\n"
+ "ld1b { z23.b }, p1/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "zip1 z31.b, z25.b, z17.b\n"
+ "ld1b { z19.b }, p1/Z, [x21]\n"
+ "zip2 z8.b, z25.b, z17.b\n"
+ "ld1b { z2.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "zip1 z27.b, z7.b, z22.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, #2, MUL VL]\n"
+ "zip2 z7.b, z7.b, z22.b\n"
+ "ld1b { z24.b }, p1/Z, [x21, #3, MUL VL]\n"
+ "zip1 z18.b, z16.b, z19.b\n"
+ "ld1b { z25.b }, p1/Z, [x21, #4, MUL VL]\n"
+ "zip1 z17.b, z21.b, z18.b\n"
+ "ld1b { z22.b }, p1/Z, [x21, #5, MUL VL]\n"
+ "zip2 z18.b, z21.b, z18.b\n"
+ "ld1b { z21.b }, p1/Z, [x21, #6, MUL VL]\n"
+ "zip2 z16.b, z16.b, z19.b\n"
+ "ld1b { z19.b }, p1/Z, [x21, #7, MUL VL]\n"
+ "addvl x21, x21, #8\n"
+ "st1b { z17.b }, p1, [x24]\n"
+ "zip1 z17.b, z20.b, z16.b\n"
+ "zip2 z20.b, z20.b, z16.b\n"
+ "st1b { z18.b }, p1, [x24, #1, MUL VL]\n"
+ "zip1 z16.b, z12.b, z2.b\n"
+ "st1b { z17.b }, p1, [x24, #2, MUL VL]\n"
+ "zip1 z17.b, z10.b, z16.b\n"
+ "st1b { z20.b }, p1, [x24, #3, MUL VL]\n"
+ "zip2 z16.b, z10.b, z16.b\n"
+ "st1b { z17.b }, p1, [x24, #4, MUL VL]\n"
+ "zip2 z17.b, z12.b, z2.b\n"
+ "st1b { z16.b }, p1, [x24, #5, MUL VL]\n"
+ "zip1 z16.b, z9.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #6, MUL VL]\n"
+ "zip2 z16.b, z9.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "zip1 z18.b, z15.b, z29.b\n"
+ "zip2 z17.b, z15.b, z29.b\n"
+ "zip1 z16.b, z0.b, z18.b\n"
+ "st1b { z16.b }, p1, [x24]\n"
+ "zip2 z16.b, z0.b, z18.b\n"
+ "st1b { z16.b }, p1, [x24, #1, MUL VL]\n"
+ "zip1 z16.b, z28.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #2, MUL VL]\n"
+ "zip2 z16.b, z28.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #3, MUL VL]\n"
+ "zip1 z17.b, z11.b, z24.b\n"
+ "zip1 z16.b, z31.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #4, MUL VL]\n"
+ "zip2 z16.b, z31.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #5, MUL VL]\n"
+ "zip2 z17.b, z11.b, z24.b\n"
+ "zip1 z16.b, z8.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #6, MUL VL]\n"
+ "zip2 z16.b, z8.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "zip1 z18.b, z4.b, z25.b\n"
+ "zip2 z17.b, z4.b, z25.b\n"
+ "zip1 z16.b, z27.b, z18.b\n"
+ "st1b { z16.b }, p1, [x24]\n"
+ "zip2 z16.b, z27.b, z18.b\n"
+ "st1b { z16.b }, p1, [x24, #1, MUL VL]\n"
+ "zip1 z16.b, z7.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #2, MUL VL]\n"
+ "zip2 z16.b, z7.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #3, MUL VL]\n"
+ "zip1 z18.b, z3.b, z6.b\n"
+ "zip1 z17.b, z5.b, z22.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #5, MUL VL]\n"
+ "zip2 z18.b, z3.b, z6.b\n"
+ "zip2 z17.b, z5.b, z22.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "zip1 z18.b, z14.b, z1.b\n"
+ "zip1 z17.b, z26.b, z21.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #1, MUL VL]\n"
+ "zip2 z18.b, z14.b, z1.b\n"
+ "zip2 z17.b, z26.b, z21.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #3, MUL VL]\n"
+ "zip1 z18.b, z13.b, z23.b\n"
+ "zip1 z17.b, z30.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #5, MUL VL]\n"
+ "zip2 z18.b, z13.b, z23.b\n"
+ "zip2 z17.b, z30.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x20, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x19, x20\n"
+ "decw x20, ALL, MUL #8\n"
+ "whilelt p0.b, XZR, x19\n"
+ "ld1b { z17.b }, p0/Z, [x25]\n"
+ "ld1b { z25.b }, p0/Z, [x23]\n"
+ "decb x19\n"
+ "ld1b { z16.b }, p0/Z, [x22]\n"
+ "zip1 z18.b, z17.b, z16.b\n"
+ "ld1b { z24.b }, p0/Z, [x21]\n"
+ "whilelt p0.b, XZR, x19\n"
+ "zip2 z23.b, z17.b, z16.b\n"
+ "ld1b { z22.b }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z16.b, z25.b, z24.b\n"
+ "ld1b { z21.b }, p0/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "zip1 z17.b, z18.b, z16.b\n"
+ "ld1b { z20.b }, p0/Z, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "zip2 z18.b, z18.b, z16.b\n"
+ "ld1b { z19.b }, p0/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "zip2 z16.b, z25.b, z24.b\n"
+ "st1b { z17.b }, p1, [x24]\n"
+ "cmp x20, #0x0\n"
+ "zip1 z17.b, z23.b, z16.b\n"
+ "st1b { z18.b }, p1, [x24, #1, MUL VL]\n"
+ "zip2 z16.b, z23.b, z16.b\n"
+ "st1b { z17.b }, p1, [x24, #2, MUL VL]\n"
+ "zip1 z18.b, z22.b, z20.b\n"
+ "st1b { z16.b }, p1, [x24, #3, MUL VL]\n"
+ "zip1 z17.b, z21.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #5, MUL VL]\n"
+ "zip2 z18.b, z22.b, z20.b\n"
+ "zip2 z17.b, z21.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x24, #7, MUL VL]\n"
+ "add x24, x24, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #8\n"
+ "cmp %x[height], #0x1\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
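+// Both the uint8_t and int8_t specializations forward to the same byte-wise
+// routine: the transpose-interleave is pure data movement, so signedness
+// does not matter.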
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x4(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
new file mode 100644
index 0000000000..34d43f5052
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_1x8.hpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
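+// As the 1x4 variant, but eight rows deep: three rounds of .b zips gather one
+// byte from each of eight consecutive rows into every 8-byte group of the
+// output, reading from the zeroed pad_row when height is not a multiple of
+// eight.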
+void sve_transpose_interleave_8VL_1x8(uint8_t *out, const uint8_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint8_t *pad_row = reinterpret_cast<uint8_t *>(alloca(width * sizeof(uint8_t)));
+
+ if (height % 8) {
+ memset(pad_row, 0, width * sizeof(uint8_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 8) * get_vector_length<uint64_t>();
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "1:" // Main row loop: Head
+ "mov x9, %x[in]\n"
+ "mov x28, %x[out]\n"
+ "add x27, x9, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add x22, x23, %x[in_stride]\n"
+ "add x21, x22, %x[in_stride]\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "cmp %x[height], #0x7\n"
+ "csel x21, x21, %x[pad_row], GT\n"
+ "csel x22, x22, %x[pad_row], GE\n"
+ "cmp %x[height], #0x5\n"
+ "csel x23, x23, %x[pad_row], GT\n"
+ "csel x24, x24, %x[pad_row], GE\n"
+ "cmp %x[height], #0x3\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "csel x26, x26, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x20, %x[width]\n"
+ "cntb x19, ALL, MUL #2\n"
+ "cmp x20, x19\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1b { z17.b }, p1/Z, [x9]\n"
+ "sub x20, x20, x19\n"
+ "ld1b { z5.b }, p1/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "ld1b { z19.b }, p1/Z, [x27]\n"
+ "cmp x20, x19\n"
+ "ld1b { z4.b }, p1/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "ld1b { z18.b }, p1/Z, [x26]\n"
+ "ld1b { z3.b }, p1/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "ld1b { z2.b }, p1/Z, [x25]\n"
+ "ld1b { z1.b }, p1/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "ld1b { z16.b }, p1/Z, [x24]\n"
+ "zip1 z0.b, z17.b, z16.b\n"
+ "ld1b { z31.b }, p1/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "zip2 z30.b, z17.b, z16.b\n"
+ "ld1b { z17.b }, p1/Z, [x23]\n"
+ "ld1b { z29.b }, p1/Z, [x23, #1, MUL VL]\n"
+ "zip1 z28.b, z5.b, z31.b\n"
+ "ld1b { z16.b }, p1/Z, [x22]\n"
+ "addvl x23, x23, #2\n"
+ "zip1 z27.b, z19.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "zip2 z25.b, z19.b, z17.b\n"
+ "ld1b { z24.b }, p1/Z, [x21]\n"
+ "zip1 z22.b, z4.b, z29.b\n"
+ "ld1b { z23.b }, p1/Z, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #2\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "zip2 z20.b, z18.b, z16.b\n"
+ "zip1 z18.b, z0.b, z21.b\n"
+ "zip1 z19.b, z2.b, z24.b\n"
+ "zip1 z17.b, z27.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z18.b, z0.b, z21.b\n"
+ "zip2 z17.b, z27.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z18.b, z30.b, z20.b\n"
+ "zip2 z19.b, z2.b, z24.b\n"
+ "zip1 z17.b, z25.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "zip2 z18.b, z30.b, z20.b\n"
+ "zip2 z17.b, z25.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "zip1 z20.b, z3.b, z26.b\n"
+ "zip1 z19.b, z1.b, z23.b\n"
+ "zip1 z18.b, z28.b, z20.b\n"
+ "zip1 z17.b, z22.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z18.b, z28.b, z20.b\n"
+ "zip2 z17.b, z22.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip2 z22.b, z5.b, z31.b\n"
+ "zip2 z21.b, z3.b, z26.b\n"
+ "zip1 z18.b, z22.b, z21.b\n"
+ "zip2 z20.b, z4.b, z29.b\n"
+ "zip2 z19.b, z1.b, z23.b\n"
+ "zip1 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "zip2 z18.b, z22.b, z21.b\n"
+ "zip2 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x20, 5f\n"
+ "4:" // Main row loop: Column loop
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "ld1b { z28.b }, p0/Z, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "ld1b { z17.b }, p0/Z, [x26]\n"
+ "addvl x26, x26, #1\n"
+ "ld1b { z27.b }, p0/Z, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "ld1b { z16.b }, p0/Z, [x24]\n"
+ "zip1 z26.b, z18.b, z16.b\n"
+ "ld1b { z25.b }, p0/Z, [x23]\n"
+ "addvl x24, x24, #1\n"
+ "zip2 z24.b, z18.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x22]\n"
+ "addvl x23, x23, #1\n"
+ "zip1 z23.b, z28.b, z25.b\n"
+ "ld1b { z22.b }, p0/Z, [x21]\n"
+ "addvl x22, x22, #1\n"
+ "zip1 z20.b, z17.b, z16.b\n"
+ "addvl x21, x21, #1\n"
+ "zip2 z21.b, z17.b, z16.b\n"
+ "decd x20, ALL, MUL #8\n"
+ "zip1 z18.b, z26.b, z20.b\n"
+ "cmp x20, #0x0\n"
+ "zip1 z19.b, z27.b, z22.b\n"
+ "zip1 z17.b, z23.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #1, MUL VL]\n"
+ "zip2 z18.b, z26.b, z20.b\n"
+ "zip2 z17.b, z23.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #2, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #3, MUL VL]\n"
+ "zip1 z18.b, z24.b, z21.b\n"
+ "zip2 z20.b, z28.b, z25.b\n"
+ "zip2 z19.b, z27.b, z22.b\n"
+ "zip1 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #4, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #5, MUL VL]\n"
+ "zip2 z18.b, z24.b, z21.b\n"
+ "zip2 z17.b, z20.b, z19.b\n"
+ "zip1 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #6, MUL VL]\n"
+ "zip2 z16.b, z18.b, z17.b\n"
+ "st1b { z16.b }, p1, [x28, #7, MUL VL]\n"
+ "add x28, x28, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #8\n"
+ "cmp %x[height], #0x1\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 8, true, VLType::SVE>(
+ uint8_t *out, const uint8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(uint8_t) / 1,
+ stride * sizeof(uint8_t),
+ (kmax-k0)
+ );
+}
+
+template<>
+void Transform<8, 8, true, VLType::SVE>(
+ int8_t *out, const int8_t *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_1x8(
+ reinterpret_cast<uint8_t *>(out),
+ reinterpret_cast<const uint8_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(int8_t) / 1,
+ stride * sizeof(int8_t),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
new file mode 100644
index 0000000000..7124f7e909
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x2.hpp
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
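+// Interleaves pairs of rows at 16-bit granularity: one round of .h zips puts
+// element c of row r next to element c of row r+1, with odd heights padded
+// from the zeroed pad_row.  The main loop processes four rows (two
+// independent pairs) per pass; the tail loop handles a final pair.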
+void sve_transpose_interleave_8VL_2x2(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 2) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 2) * get_vector_length<uint16_t>();
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "cmp %x[height], #0x4\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x27, %x[in]\n"
+ "mov x26, %x[out]\n"
+ "add x25, x27, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x22, %x[width]\n"
+ "cnth x21, ALL, MUL #8\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z8.h }, p3/Z, [x27]\n"
+ "mov x20, x26\n"
+ "ld1h { z3.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "ld1h { z22.h }, p3/Z, [x27, #2, MUL VL]\n"
+ "mov x19, x26\n"
+ "ld1h { z12.h }, p3/Z, [x27, #3, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "ld1h { z4.h }, p3/Z, [x27, #4, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "ld1h { z25.h }, p3/Z, [x27, #5, MUL VL]\n"
+ "cmp x22, x21\n"
+ "ld1h { z15.h }, p3/Z, [x27, #6, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ "ld1h { z16.h }, p3/Z, [x25]\n"
+ "zip1 z21.h, z8.h, z16.h\n"
+ "ld1h { z27.h }, p3/Z, [x25, #1, MUL VL]\n"
+ "zip2 z7.h, z8.h, z16.h\n"
+ "ld1h { z18.h }, p3/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z30.h }, p3/Z, [x25, #3, MUL VL]\n"
+ "zip1 z19.h, z3.h, z27.h\n"
+ "ld1h { z0.h }, p3/Z, [x25, #4, MUL VL]\n"
+ "zip2 z16.h, z3.h, z27.h\n"
+ "ld1h { z27.h }, p3/Z, [x25, #5, MUL VL]\n"
+ "zip1 z13.h, z22.h, z18.h\n"
+ "ld1h { z26.h }, p3/Z, [x25, #6, MUL VL]\n"
+ "zip2 z29.h, z22.h, z18.h\n"
+ "ld1h { z24.h }, p3/Z, [x25, #7, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "zip1 z20.h, z12.h, z30.h\n"
+ "ld1h { z9.h }, p3/Z, [x24]\n"
+ "zip2 z14.h, z12.h, z30.h\n"
+ "ld1h { z12.h }, p3/Z, [x24, #1, MUL VL]\n"
+ "zip1 z5.h, z4.h, z0.h\n"
+ "ld1h { z31.h }, p3/Z, [x24, #2, MUL VL]\n"
+ "zip2 z1.h, z4.h, z0.h\n"
+ "ld1h { z22.h }, p3/Z, [x24, #3, MUL VL]\n"
+ "zip1 z10.h, z25.h, z27.h\n"
+ "ld1h { z3.h }, p3/Z, [x24, #4, MUL VL]\n"
+ "zip2 z6.h, z25.h, z27.h\n"
+ "ld1h { z4.h }, p3/Z, [x24, #5, MUL VL]\n"
+ "zip1 z8.h, z15.h, z26.h\n"
+ "ld1h { z25.h }, p3/Z, [x24, #6, MUL VL]\n"
+ "zip2 z11.h, z15.h, z26.h\n"
+ "ld1h { z30.h }, p3/Z, [x24, #7, MUL VL]\n"
+ "addvl x24, x24, #8\n"
+ "zip1 z17.h, z2.h, z24.h\n"
+ "ld1h { z23.h }, p3/Z, [x23]\n"
+ "zip2 z0.h, z2.h, z24.h\n"
+ "ld1h { z28.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z15.h }, p3/Z, [x23, #2, MUL VL]\n"
+ "zip1 z18.h, z9.h, z23.h\n"
+ "ld1h { z26.h }, p3/Z, [x23, #3, MUL VL]\n"
+ "zip2 z27.h, z9.h, z23.h\n"
+ "ld1h { z2.h }, p3/Z, [x23, #4, MUL VL]\n"
+ "zip1 z9.h, z12.h, z28.h\n"
+ "ld1h { z24.h }, p3/Z, [x23, #5, MUL VL]\n"
+ "zip2 z12.h, z12.h, z28.h\n"
+ "ld1h { z23.h }, p3/Z, [x23, #6, MUL VL]\n"
+ "zip1 z28.h, z31.h, z15.h\n"
+ "zip2 z31.h, z31.h, z15.h\n"
+ "ld1h { z15.h }, p3/Z, [x23, #7, MUL VL]\n"
+ "addvl x23, x23, #8\n"
+ "st1h { z21.h }, p3, [x20]\n"
+ "zip1 z21.h, z22.h, z26.h\n"
+ "zip2 z26.h, z22.h, z26.h\n"
+ "st1h { z7.h }, p3, [x20, #1, MUL VL]\n"
+ "zip1 z7.h, z3.h, z2.h\n"
+ "st1h { z19.h }, p3, [x20, #2, MUL VL]\n"
+ "zip2 z22.h, z3.h, z2.h\n"
+ "st1h { z16.h }, p3, [x20, #3, MUL VL]\n"
+ "zip1 z2.h, z4.h, z24.h\n"
+ "st1h { z13.h }, p3, [x20, #4, MUL VL]\n"
+ "zip2 z3.h, z4.h, z24.h\n"
+ "st1h { z29.h }, p3, [x20, #5, MUL VL]\n"
+ "zip1 z4.h, z25.h, z23.h\n"
+ "st1h { z20.h }, p3, [x20, #6, MUL VL]\n"
+ "zip2 z20.h, z25.h, z23.h\n"
+ "st1h { z14.h }, p3, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "zip1 z25.h, z30.h, z15.h\n"
+ "st1h { z18.h }, p3, [x20, #-8, MUL VL]\n"
+ "zip2 z18.h, z30.h, z15.h\n"
+ "st1h { z27.h }, p3, [x20, #-7, MUL VL]\n"
+ "st1h { z9.h }, p3, [x20, #-6, MUL VL]\n"
+ "st1h { z12.h }, p3, [x20, #-5, MUL VL]\n"
+ "st1h { z28.h }, p3, [x20, #-4, MUL VL]\n"
+ "st1h { z31.h }, p3, [x20, #-3, MUL VL]\n"
+ "st1h { z21.h }, p3, [x20, #-2, MUL VL]\n"
+ "st1h { z26.h }, p3, [x20, #-1, MUL VL]\n"
+ "st1h { z5.h }, p3, [x19]\n"
+ "st1h { z1.h }, p3, [x19, #1, MUL VL]\n"
+ "st1h { z10.h }, p3, [x19, #2, MUL VL]\n"
+ "st1h { z6.h }, p3, [x19, #3, MUL VL]\n"
+ "st1h { z8.h }, p3, [x19, #4, MUL VL]\n"
+ "st1h { z11.h }, p3, [x19, #5, MUL VL]\n"
+ "st1h { z17.h }, p3, [x19, #6, MUL VL]\n"
+ "st1h { z0.h }, p3, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "st1h { z7.h }, p3, [x19, #-8, MUL VL]\n"
+ "st1h { z22.h }, p3, [x19, #-7, MUL VL]\n"
+ "st1h { z2.h }, p3, [x19, #-6, MUL VL]\n"
+ "st1h { z3.h }, p3, [x19, #-5, MUL VL]\n"
+ "st1h { z4.h }, p3, [x19, #-4, MUL VL]\n"
+ "st1h { z20.h }, p3, [x19, #-3, MUL VL]\n"
+ "st1h { z25.h }, p3, [x19, #-2, MUL VL]\n"
+ "st1h { z18.h }, p3, [x19, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x22\n"
+ "mov x19, x26\n"
+ "whilelt p0.h, XZR, x20\n"
+ "ld1h { z17.h }, p0/Z, [x27]\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
+ "zip1 z0.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p0/Z, [x24]\n"
+ "dech x20\n"
+ "zip2 z31.h, z17.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x23]\n"
+ "whilelt p2.h, XZR, x20\n"
+ "zip1 z30.h, z18.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "dech x20\n"
+ "zip2 z29.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "whilelt p1.h, XZR, x20\n"
+ "zip1 z28.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x27, #2, MUL VL]\n"
+ "dech x20\n"
+ "zip2 z27.h, z17.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x25, #2, MUL VL]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "zip1 z26.h, z18.h, z16.h\n"
+ "ld1h { z17.h }, p0/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "zip2 z25.h, z18.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "zip1 z24.h, z17.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "zip2 z23.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x24, #2, MUL VL]\n"
+ "decw x22, ALL, MUL #8\n"
+ "ld1h { z22.h }, p0/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "ld1h { z16.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z21.h, z19.h, z16.h\n"
+ "ld1h { z17.h }, p1/Z, [x23, #2, MUL VL]\n"
+ "cmp x22, #0x0\n"
+ "zip2 z20.h, z19.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "zip1 z19.h, z18.h, z17.h\n"
+ "st1h { z0.h }, p3, [x19]\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z31.h }, p3, [x19, #1, MUL VL]\n"
+ "zip1 z17.h, z22.h, z16.h\n"
+ "st1h { z28.h }, p3, [x19, #2, MUL VL]\n"
+ "zip2 z16.h, z22.h, z16.h\n"
+ "st1h { z27.h }, p3, [x19, #3, MUL VL]\n"
+ "st1h { z26.h }, p3, [x19, #4, MUL VL]\n"
+ "st1h { z25.h }, p3, [x19, #5, MUL VL]\n"
+ "st1h { z24.h }, p3, [x19, #6, MUL VL]\n"
+ "st1h { z23.h }, p3, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "st1h { z30.h }, p3, [x19, #-8, MUL VL]\n"
+ "st1h { z29.h }, p3, [x19, #-7, MUL VL]\n"
+ "st1h { z21.h }, p3, [x19, #-6, MUL VL]\n"
+ "st1h { z20.h }, p3, [x19, #-5, MUL VL]\n"
+ "st1h { z19.h }, p3, [x19, #-4, MUL VL]\n"
+ "st1h { z18.h }, p3, [x19, #-3, MUL VL]\n"
+ "st1h { z17.h }, p3, [x19, #-2, MUL VL]\n"
+ "st1h { z16.h }, p3, [x19, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #16\n"
+ "cmp %x[height], #0x4\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x27, %x[in]\n"
+ "mov x26, %x[out]\n"
+ "add x25, x27, %x[in_stride]\n"
+ "add %x[in], x25, %x[in_stride]\n"
+ "cmp %x[height], #0x1\n"
+ "csel x25, x25, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x2\n"
+ "mov x20, %x[width]\n"
+ "cnth x19, ALL, MUL #8\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z17.h }, p3/Z, [x27]\n"
+ "sub x20, x20, x19\n"
+ "ld1h { z20.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1h { z19.h }, p3/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x27, #3, MUL VL]\n"
+ "ld1h { z0.h }, p3/Z, [x27, #4, MUL VL]\n"
+ "ld1h { z31.h }, p3/Z, [x27, #5, MUL VL]\n"
+ "ld1h { z30.h }, p3/Z, [x27, #6, MUL VL]\n"
+ "ld1h { z29.h }, p3/Z, [x27, #7, MUL VL]\n"
+ "addvl x27, x27, #8\n"
+ "ld1h { z16.h }, p3/Z, [x25]\n"
+ "zip1 z28.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p3/Z, [x25, #1, MUL VL]\n"
+ "zip2 z27.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z16.h }, p3/Z, [x25, #3, MUL VL]\n"
+ "zip1 z26.h, z20.h, z18.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, #4, MUL VL]\n"
+ "zip2 z21.h, z20.h, z18.h\n"
+ "ld1h { z25.h }, p3/Z, [x25, #5, MUL VL]\n"
+ "zip1 z20.h, z19.h, z17.h\n"
+ "ld1h { z24.h }, p3/Z, [x25, #6, MUL VL]\n"
+ "zip2 z19.h, z19.h, z17.h\n"
+ "ld1h { z23.h }, p3/Z, [x25, #7, MUL VL]\n"
+ "addvl x25, x25, #8\n"
+ "zip1 z18.h, z1.h, z16.h\n"
+ "st1h { z28.h }, p3, [x26]\n"
+ "zip2 z17.h, z1.h, z16.h\n"
+ "st1h { z27.h }, p3, [x26, #1, MUL VL]\n"
+ "zip1 z16.h, z0.h, z22.h\n"
+ "st1h { z26.h }, p3, [x26, #2, MUL VL]\n"
+ "zip2 z22.h, z0.h, z22.h\n"
+ "st1h { z21.h }, p3, [x26, #3, MUL VL]\n"
+ "zip1 z21.h, z31.h, z25.h\n"
+ "st1h { z20.h }, p3, [x26, #4, MUL VL]\n"
+ "zip2 z20.h, z31.h, z25.h\n"
+ "st1h { z19.h }, p3, [x26, #5, MUL VL]\n"
+ "zip1 z19.h, z30.h, z24.h\n"
+ "st1h { z18.h }, p3, [x26, #6, MUL VL]\n"
+ "zip2 z18.h, z30.h, z24.h\n"
+ "st1h { z17.h }, p3, [x26, #7, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "zip1 z17.h, z29.h, z23.h\n"
+ "st1h { z16.h }, p3, [x26]\n"
+ "zip2 z16.h, z29.h, z23.h\n"
+ "st1h { z22.h }, p3, [x26, #1, MUL VL]\n"
+ "st1h { z21.h }, p3, [x26, #2, MUL VL]\n"
+ "st1h { z20.h }, p3, [x26, #3, MUL VL]\n"
+ "st1h { z19.h }, p3, [x26, #4, MUL VL]\n"
+ "st1h { z18.h }, p3, [x26, #5, MUL VL]\n"
+ "st1h { z17.h }, p3, [x26, #6, MUL VL]\n"
+ "st1h { z16.h }, p3, [x26, #7, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "decw x20, ALL, MUL #8\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x27]\n"
+ "ld1h { z16.h }, p0/Z, [x25]\n"
+ "zip1 z24.h, z17.h, z16.h\n"
+ "dech x19\n"
+ "zip2 z23.h, z17.h, z16.h\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z18.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "dech x19\n"
+ "ld1h { z16.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "zip1 z22.h, z18.h, z16.h\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x27, #2, MUL VL]\n"
+ "zip2 z21.h, z18.h, z16.h\n"
+ "dech x19\n"
+ "ld1h { z16.h }, p0/Z, [x25, #2, MUL VL]\n"
+ "zip1 z20.h, z17.h, z16.h\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z19.h }, p0/Z, [x27, #3, MUL VL]\n"
+ "zip2 z18.h, z17.h, z16.h\n"
+ "addvl x27, x27, #4\n"
+ "ld1h { z16.h }, p0/Z, [x25, #3, MUL VL]\n"
+ "zip1 z17.h, z19.h, z16.h\n"
+ "st1h { z24.h }, p3, [x26]\n"
+ "addvl x25, x25, #4\n"
+ "zip2 z16.h, z19.h, z16.h\n"
+ "st1h { z23.h }, p3, [x26, #1, MUL VL]\n"
+ "cmp x20, #0x0\n"
+ "st1h { z22.h }, p3, [x26, #2, MUL VL]\n"
+ "st1h { z21.h }, p3, [x26, #3, MUL VL]\n"
+ "st1h { z20.h }, p3, [x26, #4, MUL VL]\n"
+ "st1h { z18.h }, p3, [x26, #5, MUL VL]\n"
+ "st1h { z17.h }, p3, [x26, #6, MUL VL]\n"
+ "st1h { z16.h }, p3, [x26, #7, MUL VL]\n"
+ "add x26, x26, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #8\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
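+// Instantiated for bfloat16: the 16-bit zip pattern is type-agnostic, so the
+// bf16 data is simply reinterpreted as uint16_t.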
+template<>
+void Transform<8, 2, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_2x2(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
new file mode 100644
index 0000000000..891e3abeb0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4.hpp
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
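+// Two rounds of .h zips, four rows deep: every 8-byte group of the output
+// holds one 16-bit element from each of four consecutive rows (the "2x4"
+// suffix plausibly reads as 2-byte elements, four rows deep).  Heights not a
+// multiple of four are padded from the zeroed pad_row, and the main loop
+// covers eight rows (two 4-row groups) per pass.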
+void sve_transpose_interleave_8VL_2x4(uint16_t *out, const uint16_t *in, size_t width, size_t in_stride, size_t height)
+{
+ uint16_t *pad_row = reinterpret_cast<uint16_t *>(alloca(width * sizeof(uint16_t)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(uint16_t));
+ }
+
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cmp %x[height], #0x8\n"
+ "blt 6f\n"
+ "1:" // Main row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add x26, x27, %x[in_stride]\n"
+ "add x25, x26, %x[in_stride]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "add %x[in], x23, %x[in_stride]\n"
+ "sub %x[height], %x[height], #0x8\n"
+ "mov x22, %x[width]\n"
+ "cnth x21, ALL, MUL #4\n"
+ "cmp x22, x21\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1h { z21.h }, p2/Z, [x11]\n"
+ "mov x20, x10\n"
+ "ld1h { z24.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "ld1h { z8.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "mov x19, x10\n"
+ "ld1h { z11.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "ld1h { z3.h }, p2/Z, [x9]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "ld1h { z0.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "ld1h { z18.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "cmp x22, x21\n"
+ "ld1h { z12.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "ld1h { z16.h }, p2/Z, [x28]\n"
+ "zip1 z22.h, z21.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip2 z21.h, z21.h, z16.h\n"
+ "ld1h { z26.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z13.h }, p2/Z, [x28, #3, MUL VL]\n"
+ "zip1 z14.h, z24.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x27]\n"
+ "addvl x28, x28, #4\n"
+ "zip2 z24.h, z24.h, z19.h\n"
+ "ld1h { z27.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "zip1 z17.h, z8.h, z26.h\n"
+ "ld1h { z15.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "zip2 z9.h, z8.h, z26.h\n"
+ "ld1h { z5.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "zip1 z2.h, z3.h, z16.h\n"
+ "ld1h { z4.h }, p2/Z, [x26]\n"
+ "zip2 z16.h, z3.h, z16.h\n"
+ "ld1h { z23.h }, p2/Z, [x26, #1, MUL VL]\n"
+ "zip1 z20.h, z22.h, z2.h\n"
+ "ld1h { z10.h }, p2/Z, [x26, #2, MUL VL]\n"
+ "zip2 z28.h, z22.h, z2.h\n"
+ "ld1h { z8.h }, p2/Z, [x26, #3, MUL VL]\n"
+ "addvl x26, x26, #4\n"
+ "zip1 z26.h, z21.h, z16.h\n"
+ "ld1h { z25.h }, p2/Z, [x25]\n"
+ "zip2 z7.h, z21.h, z16.h\n"
+ "ld1h { z31.h }, p2/Z, [x25, #1, MUL VL]\n"
+ "zip1 z3.h, z0.h, z27.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "zip1 z22.h, z14.h, z3.h\n"
+ "ld1h { z6.h }, p2/Z, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "zip2 z19.h, z14.h, z3.h\n"
+ "ld1h { z2.h }, p2/Z, [x24]\n"
+ "zip2 z14.h, z0.h, z27.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z29.h, z24.h, z14.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "zip2 z27.h, z24.h, z14.h\n"
+ "ld1h { z1.h }, p2/Z, [x24, #3, MUL VL]\n"
+ "addvl x24, x24, #4\n"
+ "zip1 z30.h, z4.h, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x23]\n"
+ "zip2 z14.h, z4.h, z2.h\n"
+ "ld1h { z4.h }, p2/Z, [x23, #1, MUL VL]\n"
+ "zip1 z2.h, z23.h, z21.h\n"
+ "ld1h { z24.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "zip2 z21.h, z23.h, z21.h\n"
+ "ld1h { z23.h }, p2/Z, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1h { z20.h }, p2, [x20]\n"
+ "zip1 z20.h, z25.h, z3.h\n"
+ "zip2 z3.h, z25.h, z3.h\n"
+ "st1h { z28.h }, p2, [x20, #1, MUL VL]\n"
+ "zip1 z28.h, z30.h, z20.h\n"
+ "st1h { z26.h }, p2, [x20, #2, MUL VL]\n"
+ "zip2 z20.h, z30.h, z20.h\n"
+ "st1h { z7.h }, p2, [x20, #3, MUL VL]\n"
+ "zip1 z25.h, z14.h, z3.h\n"
+ "st1h { z22.h }, p2, [x20, #4, MUL VL]\n"
+ "zip2 z7.h, z14.h, z3.h\n"
+ "st1h { z19.h }, p2, [x20, #5, MUL VL]\n"
+ "zip1 z14.h, z31.h, z4.h\n"
+ "st1h { z29.h }, p2, [x20, #6, MUL VL]\n"
+ "zip1 z19.h, z2.h, z14.h\n"
+ "st1h { z27.h }, p2, [x20, #7, MUL VL]\n"
+ "addvl x20, x20, #16\n"
+ "zip2 z29.h, z2.h, z14.h\n"
+ "st1h { z28.h }, p2, [x20, #-8, MUL VL]\n"
+ "zip2 z27.h, z31.h, z4.h\n"
+ "st1h { z20.h }, p2, [x20, #-7, MUL VL]\n"
+ "zip1 z30.h, z21.h, z27.h\n"
+ "st1h { z25.h }, p2, [x20, #-6, MUL VL]\n"
+ "zip2 z20.h, z21.h, z27.h\n"
+ "st1h { z7.h }, p2, [x20, #-5, MUL VL]\n"
+ "zip1 z14.h, z18.h, z15.h\n"
+ "st1h { z19.h }, p2, [x20, #-4, MUL VL]\n"
+ "zip1 z19.h, z17.h, z14.h\n"
+ "st1h { z29.h }, p2, [x20, #-3, MUL VL]\n"
+ "zip2 z7.h, z17.h, z14.h\n"
+ "st1h { z30.h }, p2, [x20, #-2, MUL VL]\n"
+ "zip2 z14.h, z18.h, z15.h\n"
+ "st1h { z20.h }, p2, [x20, #-1, MUL VL]\n"
+ "zip1 z17.h, z9.h, z14.h\n"
+ "st1h { z19.h }, p2, [x19]\n"
+ "zip2 z27.h, z9.h, z14.h\n"
+ "st1h { z7.h }, p2, [x19, #1, MUL VL]\n"
+ "zip1 z18.h, z11.h, z13.h\n"
+ "st1h { z17.h }, p2, [x19, #2, MUL VL]\n"
+ "zip1 z17.h, z12.h, z5.h\n"
+ "st1h { z27.h }, p2, [x19, #3, MUL VL]\n"
+ "zip1 z20.h, z18.h, z17.h\n"
+ "st1h { z20.h }, p2, [x19, #4, MUL VL]\n"
+ "zip2 z18.h, z18.h, z17.h\n"
+ "st1h { z18.h }, p2, [x19, #5, MUL VL]\n"
+ "zip2 z18.h, z11.h, z13.h\n"
+ "zip2 z17.h, z12.h, z5.h\n"
+ "zip1 z29.h, z18.h, z17.h\n"
+ "st1h { z29.h }, p2, [x19, #6, MUL VL]\n"
+ "zip2 z17.h, z18.h, z17.h\n"
+ "st1h { z17.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "zip1 z18.h, z10.h, z0.h\n"
+ "zip1 z17.h, z16.h, z24.h\n"
+ "zip1 z30.h, z18.h, z17.h\n"
+ "st1h { z30.h }, p2, [x19, #-8, MUL VL]\n"
+ "zip2 z30.h, z18.h, z17.h\n"
+ "st1h { z30.h }, p2, [x19, #-7, MUL VL]\n"
+ "zip2 z18.h, z10.h, z0.h\n"
+ "zip2 z17.h, z16.h, z24.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-6, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-5, MUL VL]\n"
+ "zip1 z18.h, z8.h, z1.h\n"
+ "zip1 z17.h, z6.h, z23.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-4, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-3, MUL VL]\n"
+ "zip2 z18.h, z8.h, z1.h\n"
+ "zip2 z17.h, z6.h, z23.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-2, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x20, x22\n"
+ "mov x19, x10\n"
+ "whilelt p1.h, XZR, x20\n"
+ "ld1h { z17.h }, p1/Z, [x11]\n"
+ "ld1h { z22.h }, p1/Z, [x9]\n"
+ "dech x20\n"
+ "ld1h { z16.h }, p1/Z, [x28]\n"
+ "zip1 z19.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p1/Z, [x27]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "ld1h { z21.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ "zip1 z16.h, z22.h, z18.h\n"
+ "ld1h { z2.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z1.h, z19.h, z16.h\n"
+ "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ "zip2 z0.h, z19.h, z16.h\n"
+ "ld1h { z19.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "zip2 z16.h, z22.h, z18.h\n"
+ "ld1h { z31.h }, p1/Z, [x26]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip1 z30.h, z17.h, z16.h\n"
+ "ld1h { z29.h }, p0/Z, [x26, #1, MUL VL]\n"
+ "addvl x26, x26, #2\n"
+ "zip2 z28.h, z17.h, z16.h\n"
+ "ld1h { z27.h }, p1/Z, [x25]\n"
+ "decd x22, ALL, MUL #8\n"
+ "zip1 z17.h, z21.h, z20.h\n"
+ "ld1h { z26.h }, p0/Z, [x25, #1, MUL VL]\n"
+ "addvl x25, x25, #2\n"
+ "zip1 z16.h, z2.h, z19.h\n"
+ "ld1h { z25.h }, p1/Z, [x24]\n"
+ "cmp x22, #0x0\n"
+ "zip1 z18.h, z17.h, z16.h\n"
+ "ld1h { z24.h }, p0/Z, [x24, #1, MUL VL]\n"
+ "addvl x24, x24, #2\n"
+ "zip2 z23.h, z17.h, z16.h\n"
+ "ld1h { z22.h }, p1/Z, [x23]\n"
+ "zip2 z17.h, z21.h, z20.h\n"
+ "ld1h { z21.h }, p0/Z, [x23, #1, MUL VL]\n"
+ "addvl x23, x23, #2\n"
+ "zip2 z16.h, z2.h, z19.h\n"
+ "st1h { z1.h }, p2, [x19]\n"
+ "zip1 z20.h, z31.h, z25.h\n"
+ "st1h { z0.h }, p2, [x19, #1, MUL VL]\n"
+ "zip1 z19.h, z17.h, z16.h\n"
+ "st1h { z30.h }, p2, [x19, #2, MUL VL]\n"
+ "zip2 z17.h, z17.h, z16.h\n"
+ "st1h { z28.h }, p2, [x19, #3, MUL VL]\n"
+ "zip1 z16.h, z27.h, z22.h\n"
+ "st1h { z18.h }, p2, [x19, #4, MUL VL]\n"
+ "zip1 z18.h, z20.h, z16.h\n"
+ "st1h { z23.h }, p2, [x19, #5, MUL VL]\n"
+ "zip2 z16.h, z20.h, z16.h\n"
+ "st1h { z19.h }, p2, [x19, #6, MUL VL]\n"
+ "zip2 z19.h, z31.h, z25.h\n"
+ "st1h { z17.h }, p2, [x19, #7, MUL VL]\n"
+ "addvl x19, x19, #16\n"
+ "zip2 z17.h, z27.h, z22.h\n"
+ "st1h { z18.h }, p2, [x19, #-8, MUL VL]\n"
+ "zip1 z18.h, z29.h, z24.h\n"
+ "st1h { z16.h }, p2, [x19, #-7, MUL VL]\n"
+ "zip1 z16.h, z19.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-6, MUL VL]\n"
+ "zip2 z16.h, z19.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-5, MUL VL]\n"
+ "zip1 z17.h, z26.h, z21.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-4, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-3, MUL VL]\n"
+ "zip2 z18.h, z29.h, z24.h\n"
+ "zip2 z17.h, z26.h, z21.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-2, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x19, #-1, MUL VL]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "addvl %x[out], %x[out], #16\n"
+ "cmp %x[height], #0x8\n"
+ "bge 1b\n"
+ "cbz %x[height], 12f\n"
+ "6:" // Main loop skip
+
+ "7:" // Tail row loop: Head
+ "mov x11, %x[in]\n"
+ "mov x10, %x[out]\n"
+ "add x9, x11, %x[in_stride]\n"
+ "add x28, x9, %x[in_stride]\n"
+ "add x27, x28, %x[in_stride]\n"
+ "add %x[in], x27, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "csel x27, x27, %x[pad_row], GT\n"
+ "csel x28, x28, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x9, x9, %x[pad_row], GT\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "mov x20, %x[width]\n"
+ "cnth x19, ALL, MUL #4\n"
+ "cmp x20, x19\n"
+ "blt 9f\n"
+ "8:" // Tail row loop: Unroll column loop
+ "ld1h { z17.h }, p2/Z, [x11]\n"
+ "sub x20, x20, x19\n"
+ "ld1h { z20.h }, p2/Z, [x11, #1, MUL VL]\n"
+ "cmp x20, x19\n"
+ "ld1h { z19.h }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "ld1h { z0.h }, p2/Z, [x9]\n"
+ "ld1h { z31.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "ld1h { z16.h }, p2/Z, [x28]\n"
+ "zip1 z28.h, z17.h, z16.h\n"
+ "ld1h { z18.h }, p2/Z, [x28, #1, MUL VL]\n"
+ "zip2 z27.h, z17.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x28, #2, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x28, #3, MUL VL]\n"
+ "zip1 z25.h, z20.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x27]\n"
+ "addvl x28, x28, #4\n"
+ "zip2 z24.h, z20.h, z18.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, #1, MUL VL]\n"
+ "zip1 z22.h, z19.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "zip2 z20.h, z19.h, z17.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "zip1 z18.h, z0.h, z16.h\n"
+ "zip2 z17.h, z0.h, z16.h\n"
+ "zip1 z16.h, z28.h, z18.h\n"
+ "st1h { z16.h }, p2, [x10]\n"
+ "zip2 z16.h, z28.h, z18.h\n"
+ "st1h { z16.h }, p2, [x10, #1, MUL VL]\n"
+ "zip1 z16.h, z27.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "zip2 z16.h, z27.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #3, MUL VL]\n"
+ "zip1 z17.h, z31.h, z23.h\n"
+ "zip1 z16.h, z25.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #4, MUL VL]\n"
+ "zip2 z16.h, z25.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #5, MUL VL]\n"
+ "zip2 z17.h, z31.h, z23.h\n"
+ "zip1 z16.h, z24.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #6, MUL VL]\n"
+ "zip2 z16.h, z24.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #7, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "zip1 z18.h, z30.h, z21.h\n"
+ "zip2 z17.h, z30.h, z21.h\n"
+ "zip1 z16.h, z22.h, z18.h\n"
+ "st1h { z16.h }, p2, [x10]\n"
+ "zip2 z16.h, z22.h, z18.h\n"
+ "st1h { z16.h }, p2, [x10, #1, MUL VL]\n"
+ "zip1 z16.h, z20.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #2, MUL VL]\n"
+ "zip2 z16.h, z20.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #3, MUL VL]\n"
+ "zip1 z18.h, z1.h, z26.h\n"
+ "zip1 z17.h, z29.h, z19.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #4, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #5, MUL VL]\n"
+ "zip2 z18.h, z1.h, z26.h\n"
+ "zip2 z17.h, z29.h, z19.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #6, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #7, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "bge 8b\n"
+ "9:" // Tail row loop: Unroll column loop skip
+ "cbz x20, 11f\n"
+ "10:" // Tail row loop: Column loop
+ "mov x19, x20\n"
+ "decd x20, ALL, MUL #8\n"
+ "whilelt p0.h, XZR, x19\n"
+ "ld1h { z17.h }, p0/Z, [x11]\n"
+ "ld1h { z25.h }, p0/Z, [x9]\n"
+ "dech x19\n"
+ "ld1h { z16.h }, p0/Z, [x28]\n"
+ "zip1 z18.h, z17.h, z16.h\n"
+ "ld1h { z24.h }, p0/Z, [x27]\n"
+ "whilelt p0.h, XZR, x19\n"
+ "zip2 z23.h, z17.h, z16.h\n"
+ "ld1h { z22.h }, p0/Z, [x11, #1, MUL VL]\n"
+ "addvl x11, x11, #2\n"
+ "zip1 z16.h, z25.h, z24.h\n"
+ "ld1h { z21.h }, p0/Z, [x9, #1, MUL VL]\n"
+ "addvl x9, x9, #2\n"
+ "zip1 z17.h, z18.h, z16.h\n"
+ "ld1h { z20.h }, p0/Z, [x28, #1, MUL VL]\n"
+ "addvl x28, x28, #2\n"
+ "zip2 z18.h, z18.h, z16.h\n"
+ "ld1h { z19.h }, p0/Z, [x27, #1, MUL VL]\n"
+ "addvl x27, x27, #2\n"
+ "zip2 z16.h, z25.h, z24.h\n"
+ "st1h { z17.h }, p2, [x10]\n"
+ "cmp x20, #0x0\n"
+ "zip1 z17.h, z23.h, z16.h\n"
+ "st1h { z18.h }, p2, [x10, #1, MUL VL]\n"
+ "zip2 z16.h, z23.h, z16.h\n"
+ "st1h { z17.h }, p2, [x10, #2, MUL VL]\n"
+ "zip1 z18.h, z22.h, z20.h\n"
+ "st1h { z16.h }, p2, [x10, #3, MUL VL]\n"
+ "zip1 z17.h, z21.h, z19.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #4, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #5, MUL VL]\n"
+ "zip2 z18.h, z22.h, z20.h\n"
+ "zip2 z17.h, z21.h, z19.h\n"
+ "zip1 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #6, MUL VL]\n"
+ "zip2 z16.h, z18.h, z17.h\n"
+ "st1h { z16.h }, p2, [x10, #7, MUL VL]\n"
+ "add x10, x10, %x[out_stride]\n"
+ "bgt 10b\n"
+ "11:" // Tail row loop: Column loop skip
+ "addvl %x[out], %x[out], #8\n"
+ "cmp %x[height], #0x1\n"
+ "bge 7b\n"
+ "12:" // Done
+
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ bfloat16 *out, const bfloat16 *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_2x4(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *>(in + k0 * stride + x0),
+ (xmax-x0) * sizeof(bfloat16) / 2,
+ stride * sizeof(bfloat16),
+ (kmax-k0)
+ );
+}
+
+#endif
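The specialisation above forwards a sub-window of the source matrix to the assembly kernel. The interleave order that kernel produces is easier to see in a scalar model: within each output panel, every group of four halfwords holds one column read from four consecutive rows, with rows past the end supplied as zeros. The sketch below is an illustrative reference only; it ignores the 8-vector-length panel blocking (out_stride) and the unrolled fast paths, and the function name is hypothetical.

    #include <cstddef>
    #include <cstdint>

    // Reference element order for the 4-row interleave: emits
    // r0[j], r1[j], r2[j], r3[j] for ascending j, per group of 4 rows.
    void reference_interleave_4rows(uint16_t *out, const uint16_t *in,
                                    size_t width, size_t in_stride, size_t height) {
        for (size_t r = 0; r < height; r += 4) {
            for (size_t j = 0; j < width; j++) {
                for (size_t k = 0; k < 4; k++) {
                    const size_t row = r + k;
                    *out++ = (row < height) ? in[row * in_stride + j] : 0; // zero-pad short groups
                }
            }
        }
    }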
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
new file mode 100644
index 0000000000..1313479dbc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_transpose_interleave_8VL_2x4_fp32bf16.hpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+namespace {
+
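+// Same 8VL 2x4 transpose-interleave as the plain bf16 variant, fused with an
+// fp32 -> bf16 conversion: BFCVT fills the even halfword lanes from the zipped
+// rows 0/2, BFCVTNT fills the odd lanes from the zipped rows 1/3, so each
+// output vector carries r0[j], r1[j], r2[j], r3[j] in bf16.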
+void sve_transpose_interleave_8VL_2x4_fp32bf16(bfloat16 *out, const float *in, size_t width, size_t in_stride, size_t height)
+{
+ float *pad_row = reinterpret_cast<float *>(alloca(width * sizeof(float)));
+
+ if (height % 4) {
+ memset(pad_row, 0, width * sizeof(float));
+ }
+
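+ // Output stride in bytes: 8 halfword vectors per (rounded-up) group of 4 rows.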
+ size_t out_stride = 8 * roundup<size_t>(height, 4) * get_vector_length<uint32_t>();
+
+ __asm__ __volatile__(
+ "ptrue p4.b\n"
+ "1:" // Main row loop: Head
+ "mov x25, %x[in]\n"
+ "add x24, x25, %x[in_stride]\n"
+ "add x23, x24, %x[in_stride]\n"
+ "mov x22, %x[width]\n"
+ "cnth x19, ALL, MUL #4\n"
+ "add x21, x23, %x[in_stride]\n"
+ "cmp %x[height], #0x3\n"
+ "add %x[in], x21, %x[in_stride]\n"
+ "csel x21, x21, %x[pad_row], GT\n"
+ "csel x23, x23, %x[pad_row], GE\n"
+ "cmp %x[height], #0x1\n"
+ "csel x24, x24, %x[pad_row], GT\n"
+ "cmp x22, x19\n"
+ "mov x20, %x[out]\n"
+ "sub %x[height], %x[height], #0x4\n"
+ "blt 3f\n"
+ "2:" // Main row loop: Unroll column loop
+ "ld1w { z19.s }, p4/Z, [x25]\n"
+ "ld1w { z18.s }, p4/Z, [x25, #1, MUL VL]\n"
+ "sub x22, x22, x19\n"
+ "cmp x22, x19\n"
+ "ld1w { z20.s }, p4/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x23]\n"
+ "ld1w { z17.s }, p4/Z, [x23, #1, MUL VL]\n"
+ "zip1 z22.s, z19.s, z23.s\n"
+ "zip2 z21.s, z19.s, z23.s\n"
+ "ld1w { z31.s }, p4/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x23, #3, MUL VL]\n"
+ "zip1 z9.s, z18.s, z17.s\n"
+ "zip2 z7.s, z18.s, z17.s\n"
+ "ld1w { z19.s }, p4/Z, [x25, #4, MUL VL]\n"
+ "ld1w { z18.s }, p4/Z, [x25, #5, MUL VL]\n"
+ "zip1 z6.s, z20.s, z31.s\n"
+ "zip2 z5.s, z20.s, z31.s\n"
+ "ld1w { z15.s }, p4/Z, [x25, #6, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25, #7, MUL VL]\n"
+ "zip1 z3.s, z24.s, z16.s\n"
+ "zip2 z2.s, z24.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x23, #4, MUL VL]\n"
+ "ld1w { z17.s }, p4/Z, [x23, #5, MUL VL]\n"
+ "zip1 z1.s, z19.s, z16.s\n"
+ "zip2 z0.s, z19.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x23, #6, MUL VL]\n"
+ "ld1w { z19.s }, p4/Z, [x23, #7, MUL VL]\n"
+ "zip1 z31.s, z18.s, z17.s\n"
+ "zip2 z30.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p4/Z, [x24]\n"
+ "ld1w { z17.s }, p4/Z, [x24, #1, MUL VL]\n"
+ "zip1 z29.s, z15.s, z16.s\n"
+ "zip2 z28.s, z15.s, z16.s\n"
+ "ld1w { z16.s }, p4/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z23.s }, p4/Z, [x24, #3, MUL VL]\n"
+ "zip1 z27.s, z20.s, z19.s\n"
+ "zip2 z26.s, z20.s, z19.s\n"
+ "ld1w { z11.s }, p4/Z, [x21]\n"
+ "ld1w { z8.s }, p4/Z, [x21, #1, MUL VL]\n"
+ ".inst 0x658ab2d8 // bfcvt z24.h, p4/M, z22.s\n"
+ "zip1 z25.s, z18.s, z11.s\n"
+ "ld1w { z4.s }, p4/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z22.s }, p4/Z, [x21, #3, MUL VL]\n"
+ ".inst 0x658ab2af // bfcvt z15.h, p4/M, z21.s\n"
+ "zip2 z14.s, z18.s, z11.s\n"
+ "ld1w { z21.s }, p4/Z, [x24, #4, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x24, #5, MUL VL]\n"
+ ".inst 0x658ab12d // bfcvt z13.h, p4/M, z9.s\n"
+ "zip1 z12.s, z17.s, z8.s\n"
+ "ld1w { z11.s }, p4/Z, [x24, #6, MUL VL]\n"
+ "ld1w { z10.s }, p4/Z, [x24, #7, MUL VL]\n"
+ ".inst 0x658ab0e9 // bfcvt z9.h, p4/M, z7.s\n"
+ "zip2 z8.s, z17.s, z8.s\n"
+ "ld1w { z19.s }, p4/Z, [x21, #4, MUL VL]\n"
+ "ld1w { z18.s }, p4/Z, [x21, #5, MUL VL]\n"
+ ".inst 0x658ab0c7 // bfcvt z7.h, p4/M, z6.s\n"
+ "zip1 z6.s, z16.s, z4.s\n"
+ "ld1w { z17.s }, p4/Z, [x21, #6, MUL VL]\n"
+ ".inst 0x658ab0a5 // bfcvt z5.h, p4/M, z5.s\n"
+ "zip2 z4.s, z16.s, z4.s\n"
+ "ld1w { z16.s }, p4/Z, [x21, #7, MUL VL]\n"
+ ".inst 0x658ab063 // bfcvt z3.h, p4/M, z3.s\n"
+ ".inst 0x658ab042 // bfcvt z2.h, p4/M, z2.s\n"
+ "addvl x25, x25, #8\n"
+ "addvl x24, x24, #8\n"
+ ".inst 0x658ab021 // bfcvt z1.h, p4/M, z1.s\n"
+ ".inst 0x658ab000 // bfcvt z0.h, p4/M, z0.s\n"
+ "addvl x23, x23, #8\n"
+ "addvl x21, x21, #8\n"
+ ".inst 0x658ab3ff // bfcvt z31.h, p4/M, z31.s\n"
+ ".inst 0x658ab3de // bfcvt z30.h, p4/M, z30.s\n"
+ ".inst 0x658ab3bd // bfcvt z29.h, p4/M, z29.s\n"
+ ".inst 0x658ab39c // bfcvt z28.h, p4/M, z28.s\n"
+ ".inst 0x658ab37b // bfcvt z27.h, p4/M, z27.s\n"
+ ".inst 0x658ab35a // bfcvt z26.h, p4/M, z26.s\n"
+ ".inst 0x648ab338 // bfcvtnt z24.h, p4/M, z25.s\n"
+ "zip1 z25.s, z23.s, z22.s\n"
+ "st1h { z24.h }, p4, [x20]\n"
+ "zip2 z24.s, z23.s, z22.s\n"
+ "zip1 z23.s, z21.s, z19.s\n"
+ "zip2 z22.s, z21.s, z19.s\n"
+ "zip1 z21.s, z20.s, z18.s\n"
+ "zip2 z20.s, z20.s, z18.s\n"
+ "zip1 z19.s, z11.s, z17.s\n"
+ "zip2 z18.s, z11.s, z17.s\n"
+ "zip1 z17.s, z10.s, z16.s\n"
+ "zip2 z16.s, z10.s, z16.s\n"
+ ".inst 0x648ab1cf // bfcvtnt z15.h, p4/M, z14.s\n"
+ "st1h { z15.h }, p4, [x20, #1, MUL VL]\n"
+ ".inst 0x648ab18d // bfcvtnt z13.h, p4/M, z12.s\n"
+ ".inst 0x648ab109 // bfcvtnt z9.h, p4/M, z8.s\n"
+ "st1h { z13.h }, p4, [x20, #2, MUL VL]\n"
+ ".inst 0x648ab0c7 // bfcvtnt z7.h, p4/M, z6.s\n"
+ ".inst 0x648ab085 // bfcvtnt z5.h, p4/M, z4.s\n"
+ "st1h { z9.h }, p4, [x20, #3, MUL VL]\n"
+ ".inst 0x648ab323 // bfcvtnt z3.h, p4/M, z25.s\n"
+ ".inst 0x648ab302 // bfcvtnt z2.h, p4/M, z24.s\n"
+ "st1h { z7.h }, p4, [x20, #4, MUL VL]\n"
+ "st1h { z5.h }, p4, [x20, #5, MUL VL]\n"
+ ".inst 0x648ab2e1 // bfcvtnt z1.h, p4/M, z23.s\n"
+ ".inst 0x648ab2c0 // bfcvtnt z0.h, p4/M, z22.s\n"
+ "st1h { z3.h }, p4, [x20, #6, MUL VL]\n"
+ ".inst 0x648ab2bf // bfcvtnt z31.h, p4/M, z21.s\n"
+ ".inst 0x648ab29e // bfcvtnt z30.h, p4/M, z20.s\n"
+ "st1h { z2.h }, p4, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ ".inst 0x648ab27d // bfcvtnt z29.h, p4/M, z19.s\n"
+ ".inst 0x648ab25c // bfcvtnt z28.h, p4/M, z18.s\n"
+ ".inst 0x648ab23b // bfcvtnt z27.h, p4/M, z17.s\n"
+ ".inst 0x648ab21a // bfcvtnt z26.h, p4/M, z16.s\n"
+ "st1h { z1.h }, p4, [x20]\n"
+ "st1h { z0.h }, p4, [x20, #1, MUL VL]\n"
+ "st1h { z31.h }, p4, [x20, #2, MUL VL]\n"
+ "st1h { z30.h }, p4, [x20, #3, MUL VL]\n"
+ "st1h { z29.h }, p4, [x20, #4, MUL VL]\n"
+ "st1h { z28.h }, p4, [x20, #5, MUL VL]\n"
+ "st1h { z27.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z26.h }, p4, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bge 2b\n"
+ "3:" // Main row loop: Unroll column loop skip
+ "cbz x22, 5f\n"
+ "4:" // Main row loop: Column loop
+ "mov x19, x22\n"
+ "whilelt p3.s, XZR, x19\n"
+ "ld1w { z22.s }, p3/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x23]\n"
+ "decw x19\n"
+ "whilelt p2.s, XZR, x19\n"
+ "ld1w { z20.s }, p2/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [x23, #1, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p1.s, XZR, x19\n"
+ "ld1w { z18.s }, p1/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z17.s }, p1/Z, [x23, #2, MUL VL]\n"
+ "decw x19\n"
+ "whilelt p0.s, XZR, x19\n"
+ "ld1w { z28.s }, p0/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z16.s }, p0/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z27.s }, p3/Z, [x24]\n"
+ "ld1w { z3.s }, p2/Z, [x24, #1, MUL VL]\n"
+ "zip1 z26.s, z22.s, z21.s\n"
+ "zip2 z25.s, z22.s, z21.s\n"
+ "ld1w { z2.s }, p1/Z, [x24, #2, MUL VL]\n"
+ "ld1w { z1.s }, p0/Z, [x24, #3, MUL VL]\n"
+ "zip1 z24.s, z20.s, z19.s\n"
+ "zip2 z23.s, z20.s, z19.s\n"
+ "ld1w { z22.s }, p3/Z, [x21]\n"
+ "ld1w { z21.s }, p2/Z, [x21, #1, MUL VL]\n"
+ "zip1 z20.s, z18.s, z17.s\n"
+ "zip2 z19.s, z18.s, z17.s\n"
+ "ld1w { z18.s }, p1/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z0.s }, p0/Z, [x21, #3, MUL VL]\n"
+ "zip1 z17.s, z28.s, z16.s\n"
+ "zip2 z16.s, z28.s, z16.s\n"
+ "decd x22, ALL, MUL #8\n"
+ ".inst 0x658ab35f // bfcvt z31.h, p4/M, z26.s\n"
+ "zip1 z30.s, z27.s, z22.s\n"
+ "cmp x22, #0x0\n"
+ ".inst 0x658ab33d // bfcvt z29.h, p4/M, z25.s\n"
+ "zip2 z28.s, z27.s, z22.s\n"
+ "addvl x25, x25, #4\n"
+ "addvl x24, x24, #4\n"
+ ".inst 0x658ab31b // bfcvt z27.h, p4/M, z24.s\n"
+ "zip1 z26.s, z3.s, z21.s\n"
+ "addvl x23, x23, #4\n"
+ "addvl x21, x21, #4\n"
+ ".inst 0x658ab2f9 // bfcvt z25.h, p4/M, z23.s\n"
+ "zip2 z24.s, z3.s, z21.s\n"
+ ".inst 0x658ab297 // bfcvt z23.h, p4/M, z20.s\n"
+ "zip1 z22.s, z2.s, z18.s\n"
+ ".inst 0x658ab275 // bfcvt z21.h, p4/M, z19.s\n"
+ "zip2 z20.s, z2.s, z18.s\n"
+ ".inst 0x658ab233 // bfcvt z19.h, p4/M, z17.s\n"
+ "zip1 z18.s, z1.s, z0.s\n"
+ ".inst 0x658ab211 // bfcvt z17.h, p4/M, z16.s\n"
+ "zip2 z16.s, z1.s, z0.s\n"
+ ".inst 0x648ab3df // bfcvtnt z31.h, p4/M, z30.s\n"
+ ".inst 0x648ab39d // bfcvtnt z29.h, p4/M, z28.s\n"
+ "st1h { z31.h }, p4, [x20]\n"
+ ".inst 0x648ab35b // bfcvtnt z27.h, p4/M, z26.s\n"
+ ".inst 0x648ab319 // bfcvtnt z25.h, p4/M, z24.s\n"
+ "st1h { z29.h }, p4, [x20, #1, MUL VL]\n"
+ ".inst 0x648ab2d7 // bfcvtnt z23.h, p4/M, z22.s\n"
+ ".inst 0x648ab295 // bfcvtnt z21.h, p4/M, z20.s\n"
+ "st1h { z27.h }, p4, [x20, #2, MUL VL]\n"
+ ".inst 0x648ab253 // bfcvtnt z19.h, p4/M, z18.s\n"
+ ".inst 0x648ab211 // bfcvtnt z17.h, p4/M, z16.s\n"
+ "st1h { z25.h }, p4, [x20, #3, MUL VL]\n"
+ "st1h { z23.h }, p4, [x20, #4, MUL VL]\n"
+ "st1h { z21.h }, p4, [x20, #5, MUL VL]\n"
+ "st1h { z19.h }, p4, [x20, #6, MUL VL]\n"
+ "st1h { z17.h }, p4, [x20, #7, MUL VL]\n"
+ "add x20, x20, %x[out_stride]\n"
+ "bgt 4b\n"
+ "5:" // Main row loop: Column loop skip
+ "cmp %x[height], #0x1\n"
+ "addvl %x[out], %x[out], #8\n"
+ "bge 1b\n"
+ : [height] "+&r" (height), [in] "+&r" (in), [out] "+&r" (out)
+ : [in_stride] "r" (in_stride), [out_stride] "r" (out_stride), [pad_row] "r" (pad_row), [width] "r" (width)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // anonymous namespace
+
+template<>
+void Transform<8, 4, true, VLType::SVE>(
+ bfloat16 *out, const float *in, int stride, int x0, int xmax, int k0, int kmax)
+{
+ sve_transpose_interleave_8VL_2x4_fp32bf16(
+ out,
+ in + k0 * stride + x0,
+ (xmax-x0),
+ stride * sizeof(float),
+ (kmax-k0)
+ );
+}
+
+#endif
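Each BFCVT/BFCVTNT pair above narrows fp32 lanes to bf16 on the fly, which is what lets a single pretransposed buffer feed the bf16 kernels from fp32 inputs. A scalar model of that narrowing, assuming round-to-nearest-even (an assumption; NaN payloads are not treated specially here):

    #include <cstdint>
    #include <cstring>

    // Keep the top 16 bits of the fp32 encoding, rounding the discarded
    // half to nearest with ties to even.
    static inline uint16_t float_to_bf16(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        const uint32_t lsb = (bits >> 16) & 1u; // current LSB of the kept half
        bits += 0x7FFFu + lsb;                  // ties round to even
        return static_cast<uint16_t>(bits >> 16);
    }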
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
index a3216c494f..02367bd7e7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,7 +58,7 @@ struct TransposeInterleaveCommon {
}
}
- static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
+ static void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
const auto ldin = stride;
TOut *outarray = out;
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index e648ce2fb5..4ba03da6e7 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,6 +37,29 @@
namespace arm_gemm {
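+// Best-effort introspection helper: recovers a kernel strategy's class name
+// from __PRETTY_FUNCTION__ by scanning for the "cls_" naming prefix, so a type
+// named "cls_foo" yields "foo". GCC/Clang only; other compilers get
+// "(unsupported)", and unexpected formats fall back to "(unknown)".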
template<typename T>
+std::string get_type_name() {
+#ifdef __GNUC__
+ std::string s = __PRETTY_FUNCTION__;
+
+ auto start = s.find("cls_");
+
+ if (start==std::string::npos) {
+ return "(unknown)";
+ }
+
+ for(size_t x = start+4; x<s.size(); x++) {
+ if (s[x] == ';' || s[x] == ']') {
+ return s.substr(start+4, x-(start+4));
+ }
+ }
+
+ return "(unknown)";
+#else
+ return "(unsupported)";
+#endif
+}
+
+template<typename T>
inline T iceildiv(const T a, const T b) {
return (a + b - 1) / b;
}
diff --git a/src/core/cpu/kernels/assembly/arm_gemm.hpp b/src/core/cpu/kernels/assembly/arm_gemm.hpp
index 81e355d6b3..e38cc09202 100644
--- a/src/core/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/core/cpu/kernels/assembly/arm_gemm.hpp
@@ -44,9 +44,7 @@ enum class GemmMethod
GEMM_INTERLEAVED_2D,
QUANTIZE_WRAPPER,
QUANTIZE_WRAPPER_2D,
- GEMM_HYBRID_QUANTIZED,
- INDIRECT_GEMM,
- CONVOLUTION_GEMM
+ GEMM_HYBRID_QUANTIZED
};
struct KernelDescription
@@ -113,13 +111,15 @@ public:
bool _indirect_input;
Activation _act;
int _maxthreads;
+ bool _fast_mode;
const GemmConfig *_cfg;
GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
unsigned int K, unsigned int Ksections, unsigned int nbatches,
unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
- const GemmConfig *cfg = nullptr)
- : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg)
+ bool fast_mode = false, const GemmConfig *cfg = nullptr)
+ : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _fast_mode(fast_mode),
+ _cfg(cfg)
{
}
};
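The new fast_mode flag slots in ahead of the optional GemmConfig pointer and defaults to false, so existing call sites compile unchanged. A minimal sketch of an opt-in call site (the helper and its values are hypothetical placeholders):

    // Hypothetical wrapper showing the new argument position.
    arm_gemm::GemmArgs make_args(const CPUInfo *ci, unsigned int M, unsigned int N,
                                 unsigned int K, unsigned int Ksections,
                                 unsigned int nbatches, unsigned int nmulti,
                                 arm_gemm::Activation act, int nthreads, bool fast)
    {
        return arm_gemm::GemmArgs(ci, M, N, K, Ksections, nbatches, nmulti,
                                  /* indirect_input */ false, act, nthreads,
                                  /* fast_mode */ fast); // _cfg stays defaulted
    }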
diff --git a/src/core/cpu/kernels/assembly/gemm_common.hpp b/src/core/cpu/kernels/assembly/gemm_common.hpp
index 4af85ed663..378f1041be 100644
--- a/src/core/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/core/cpu/kernels/assembly/gemm_common.hpp
@@ -30,6 +30,9 @@
namespace arm_gemm
{
+// Avoid circular dependency with arm_gemm.hpp
+struct GemmConfig;
+
// Abstract class for the GEMM/GEMV functions.
//
// GEMM implementations may be "native" (never require any input
@@ -137,6 +140,10 @@ public:
{
}
+ /*** Introspection interface ***/
+ /* Get the configuration of this GEMM */
+ virtual GemmConfig get_config() = 0;
+
// Destructor
virtual ~IGemmCommon()
{
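Since get_config() is pure virtual, every concrete GEMM must now report how it was instantiated. A sketch of an override in a hypothetical implementation class; the GemmConfig field names below are assumptions, not taken from this patch:

    // ExampleGemm is hypothetical; only the override shape is the point here.
    arm_gemm::GemmConfig ExampleGemm::get_config()
    {
        arm_gemm::GemmConfig c{};
        c.method = arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D; // placeholder method
        c.filter = "example_kernel";                          // assumed name field
        return c;
    }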
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index bc9a3056e8..0647a473e2 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -101,6 +101,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
fc_info.retain_internal_weights, // retain_internal_weights
gemmlowp_output_stage, // gemmlowp_output_stage
fc_info.fp_mixed_precision, // fp_mixed_precision
+ false, // fast_math
true, // broadcast_bias
ActivationLayerInfo()); // activation_info
@@ -151,6 +152,7 @@ void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context
fc_info.retain_internal_weights, // retain_internal_weights
gemmlowp_output_stage, // gemmlowp_output_stage
fc_info.fp_mixed_precision, // fp_mixed_precision
+ false, // fast_math
true, // broadcast_bias
fc_info.activation_info, // activation_info
fc_info.constant_weights); // constant_weights
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index f926b1d0a6..16735dde0e 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -128,6 +128,7 @@ void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_contex
false, // retain_internal_weights
gemmlowp_output_stage, // gemmlowp_output_stage
false, // fp_mixed_precision
+ false, // fast_math
true, // broadcast_bias
act_info); // activation_info
@@ -167,6 +168,7 @@ Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
false, // retain_internal_weights
gemmlowp_output_stage, // gemmlowp_output_stage
false, // fp_mixed_precision
+ false, // fast_math
true, // broadcast_bias
act_info); // activation_info
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 1022e397d0..e88bd3b5d4 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -67,7 +67,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
case ConvolutionMethod::GEMM:
{
auto f = std::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
_function = std::move(f);
break;
}
diff --git a/src/runtime/cpu/operators/CpuGemm.cpp b/src/runtime/cpu/operators/CpuGemm.cpp
index 9a4d171ce6..c6abe1f893 100644
--- a/src/runtime/cpu/operators/CpuGemm.cpp
+++ b/src/runtime/cpu/operators/CpuGemm.cpp
@@ -48,6 +48,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
asm_info.activation_info = info.activation_info();
+ asm_info.fast_mode = info.fast_math();
return asm_info;
}
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.cpp b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
index a0424b1c63..fcdf8aa8f6 100644
--- a/src/runtime/cpu/operators/CpuGemmConvolution.cpp
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
@@ -66,7 +66,7 @@ void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo
// Create GEMMInfo structure
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
+ false, GEMMLowpOutputStageInfo(), false, false, false, act_info);
// Supported activations in GEMM
const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
@@ -115,7 +115,7 @@ void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo
quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, false, act_info));
auto mm_mem_req = _mm_gemmlowp->workspace();
for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
@@ -146,7 +146,7 @@ Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo
// Create GEMMInfo structure
const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
+ false, GEMMLowpOutputStageInfo(), false, false, false, act_info);
if(is_quantized)
{
@@ -186,7 +186,8 @@ Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo
std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
+ return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, false,
+ act_info));
}
else
{
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
index c2e9f24ff6..10eece99eb 100644
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -86,6 +86,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect
asm_info.padding_left = info.conv_info.pad_left();
asm_info.padding_value = 0.f;
asm_info.negated_offsets = false;
+ asm_info.fast_mode = info.enable_fast_math;
return asm_info;
}
} // namespace
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 651ce436a0..56eb4fbb87 100644
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -63,6 +63,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
asm_info.activation_info = info.activation_info();
asm_info.output_stage = info.gemmlowp_output_stage();
+ asm_info.fast_mode = info.fast_math();
return asm_info;
}
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 79ea1cb5a7..bbbd5ac458 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -542,7 +542,7 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
const CPUInfo &ci = NEScheduler::get().cpu_info();
unsigned int num_threads = NEScheduler::get().num_threads();
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -556,11 +556,11 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
arm_gemm::Activation activation, const AsmGemmInfo &info)
{
ARM_COMPUTE_UNUSED(activation);
- Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
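With both creation paths forwarding info.fast_mode, the flag now travels end to end. Schematically (glue code elided, illustrative only):

    // GEMMInfo (user API) -> AsmGemmInfo -> arm_gemm::GemmArgs
    AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); // copies gemm_info.fast_math()
    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis,
                            p.indirect, activation, num_threads, asm_info.fast_mode);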
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 355273adeb..88cfed002a 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -51,6 +51,7 @@ struct AsmGemmInfo
int64_t padding_top{ 0 };
int64_t padding_left{ 0 };
float padding_value{ 0.f };
+ bool fast_mode{ false };
};
/** Assembly kernel glue */
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index c118da66ae..5f5fa3b653 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -98,7 +98,7 @@ protected:
(disable_c) ? nullptr : &c,
&dst,
alpha, beta,
- GEMMInfo(false, false, false, (reinterpret_output_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d, false, GEMMLowpOutputStageInfo(), false, (reinterpret_input_as_3d
+ GEMMInfo(false, false, false, (reinterpret_output_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d, false, GEMMLowpOutputStageInfo(), false, false, (reinterpret_input_as_3d
|| reinterpret_output_as_3d)));
ARM_COMPUTE_ASSERT(a.info()->is_resizable());
ARM_COMPUTE_ASSERT(b.info()->is_resizable());
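For reference, the updated fixture call with each positional argument labelled; the names from retain_internal_weights onward match the annotated CL call sites earlier in this patch, the first three are the standard GEMMInfo reshape flags, and the newly inserted false is fast_math:

    GEMMInfo(false,                                               // is_a_reshaped
             false,                                               // is_b_reshaped
             false,                                               // reshape_b_only_on_first_run
             reinterpret_output_as_3d ? output_shape[2] : 0,      // depth_output_gemm3d
             reinterpret_input_as_3d,                             // reinterpret_input_as_3d
             false,                                               // retain_internal_weights
             GEMMLowpOutputStageInfo(),                           // gemmlowp_output_stage
             false,                                               // fp_mixed_precision
             false,                                               // fast_math (new)
             reinterpret_input_as_3d || reinterpret_output_as_3d) // broadcast_bias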